SAMPLE TEST OF THE PROCESS¶
In [5]:
import os
import json
import folium
import pandas as pd
import numpy as np
from zipfile import ZipFile
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from IPython.display import display, HTML, Video
%matplotlib inline
# %matplotlib notebook
pd.set_option("float_format", '{:0.2f}'.format)
pd.set_option('display.max_columns', 30)
In [6]:
try:
import google.colab
IN_COLAB = True
except:
IN_COLAB = False
In [7]:
if IN_COLAB:
datasets_folder = './pvs-passive-vehicular-sensors-datasets/'
else:
datasets_folder = '../input/pvs-passive-vehicular-sensors-datasets/'
In [8]:
kaggle_json = {"username":"","key":""}
In [9]:
with open('./kaggle.json', 'w') as f:
json.dump(kaggle_json, f)
In [10]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
The syntax of the command is incorrect. 'cp' is not recognized as an internal or external command, operable program or batch file.
In [11]:
import sys
!{sys.executable} -m pip install --upgrade kaggle
Defaulting to user installation because normal site-packages is not writeable Requirement already satisfied: kaggle in c:\users\sn161663\appdata\roaming\python\python313\site-packages (1.7.4.5) Requirement already satisfied: bleach in c:\programdata\anaconda3\lib\site-packages (from kaggle) (6.2.0) Requirement already satisfied: certifi>=14.05.14 in c:\programdata\anaconda3\lib\site-packages (from kaggle) (2025.4.26) Requirement already satisfied: charset-normalizer in c:\programdata\anaconda3\lib\site-packages (from kaggle) (3.3.2) Requirement already satisfied: idna in c:\programdata\anaconda3\lib\site-packages (from kaggle) (3.7) Requirement already satisfied: protobuf in c:\programdata\anaconda3\lib\site-packages (from kaggle) (5.29.3) Requirement already satisfied: python-dateutil>=2.5.3 in c:\programdata\anaconda3\lib\site-packages (from kaggle) (2.9.0.post0) Requirement already satisfied: python-slugify in c:\programdata\anaconda3\lib\site-packages (from kaggle) (5.0.2) Requirement already satisfied: requests in c:\programdata\anaconda3\lib\site-packages (from kaggle) (2.32.3) Requirement already satisfied: setuptools>=21.0.0 in c:\programdata\anaconda3\lib\site-packages (from kaggle) (72.1.0) Requirement already satisfied: six>=1.10 in c:\programdata\anaconda3\lib\site-packages (from kaggle) (1.17.0) Requirement already satisfied: text-unidecode in c:\programdata\anaconda3\lib\site-packages (from kaggle) (1.3) Requirement already satisfied: tqdm in c:\programdata\anaconda3\lib\site-packages (from kaggle) (4.67.1) Requirement already satisfied: urllib3>=1.15.1 in c:\programdata\anaconda3\lib\site-packages (from kaggle) (2.3.0) Requirement already satisfied: webencodings in c:\programdata\anaconda3\lib\site-packages (from kaggle) (0.5.1) Requirement already satisfied: colorama in c:\programdata\anaconda3\lib\site-packages (from tqdm->kaggle) (0.4.6)
In [12]:
import pathlib, shutil, os
src = pathlib.Path("kaggle.json")
assert src.exists(), "Place kaggle.json next to this notebook (or fix the path)."
kdir = pathlib.Path.home() / ".kaggle"
kdir.mkdir(exist_ok=True)
dst = kdir / "kaggle.json"
shutil.copyfile(src, dst)
try:
os.chmod(dst, 0o600)
except Exception as e:
print("chmod skipped:", e)
print("✓ kaggle.json installed at:", dst)
✓ kaggle.json installed at: C:\Users\sn161663\.kaggle\kaggle.json
In [13]:
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi(); api.authenticate()
print("✅ Kaggle authentication OK")
✅ Kaggle authentication OK
In [14]:
import os
os.environ["KAGGLE_USERNAME"] = "<your_kaggle_username>"
os.environ["KAGGLE_KEY"] = "<your_kaggle_api_key>"
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi(); api.authenticate()
print("✅ Kaggle authentication OK via env vars")
✅ Kaggle authentication OK via env vars
In [15]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
The syntax of the command is incorrect. 'cp' is not recognized as an internal or external command, operable program or batch file.
In [16]:
import pathlib, glob
DATA_DIR = pathlib.Path.cwd() / "PVS_dataset"
DATA_DIR.mkdir(exist_ok=True)
api.dataset_download_files(
dataset="jefmenegazzo/pvs-passive-vehicular-sensors-datasets",
path=str(DATA_DIR),
unzip=False
)
zip_paths = glob.glob(str(DATA_DIR / "*.zip"))
print(zip_paths or "No ZIP found yet.")
Dataset URL: https://www.kaggle.com/datasets/jefmenegazzo/pvs-passive-vehicular-sensors-datasets
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) Cell In[16], line 5 2 DATA_DIR = pathlib.Path.cwd() / "PVS_dataset" 3 DATA_DIR.mkdir(exist_ok=True) ----> 5 api.dataset_download_files( 6 dataset="jefmenegazzo/pvs-passive-vehicular-sensors-datasets", 7 path=str(DATA_DIR), 8 unzip=False 9 ) 11 zip_paths = glob.glob(str(DATA_DIR / "*.zip")) 12 print(zip_paths or "No ZIP found yet.") File ~\AppData\Roaming\Python\Python313\site-packages\kaggle\api\kaggle_api_extended.py:1664, in KaggleApi.dataset_download_files(self, dataset, path, force, quiet, unzip, licenses) 1662 request.dataset_slug = dataset_slug 1663 request.dataset_version_number = dataset_version_number -> 1664 response = kaggle.datasets.dataset_api_client.download_dataset(request) 1666 outfile = os.path.join(effective_path, dataset_slug + '.zip') 1667 if force or self.download_needed(response, outfile, quiet): File ~\AppData\Roaming\Python\Python313\site-packages\kagglesdk\datasets\services\dataset_api_service.py:80, in DatasetApiClient.download_dataset(self, request) 77 if request is None: 78 request = ApiDownloadDatasetRequest() ---> 80 return self._client.call("datasets.DatasetApiService", "ApiDownloadDataset", request, HttpRedirect) File ~\AppData\Roaming\Python\Python313\site-packages\kagglesdk\kaggle_http_client.py:124, in KaggleHttpClient.call(self, service_name, request_name, request, response_type) 122 # Merge environment settings into session 123 settings = self._session.merge_environment_settings(http_request.url, {}, None, None, None) --> 124 http_response = self._session.send(http_request, **settings) 126 response = self._prepare_response(response_type, http_response) 127 return response File C:\ProgramData\anaconda3\Lib\site-packages\requests\sessions.py:724, in Session.send(self, request, **kwargs) 721 if allow_redirects: 722 # Redirect resolving generator. 723 gen = self.resolve_redirects(r, request, **kwargs) --> 724 history = [resp for resp in gen] 725 else: 726 history = [] File C:\ProgramData\anaconda3\Lib\site-packages\requests\sessions.py:265, in SessionRedirectMixin.resolve_redirects(self, resp, req, stream, timeout, verify, cert, proxies, yield_requests, **adapter_kwargs) 263 yield req 264 else: --> 265 resp = self.send( 266 req, 267 stream=stream, 268 timeout=timeout, 269 verify=verify, 270 cert=cert, 271 proxies=proxies, 272 allow_redirects=False, 273 **adapter_kwargs, 274 ) 276 extract_cookies_to_jar(self.cookies, prepared_request, resp.raw) 278 # extract redirect url, if any, for the next loop File C:\ProgramData\anaconda3\Lib\site-packages\requests\sessions.py:746, in Session.send(self, request, **kwargs) 743 pass 745 if not stream: --> 746 r.content 748 return r File C:\ProgramData\anaconda3\Lib\site-packages\requests\models.py:902, in Response.content(self) 900 self._content = None 901 else: --> 902 self._content = b"".join(self.iter_content(CONTENT_CHUNK_SIZE)) or b"" 904 self._content_consumed = True 905 # don't need to release the connection; that's been handled by urllib3 906 # since we exhausted the data. File C:\ProgramData\anaconda3\Lib\site-packages\requests\models.py:820, in Response.iter_content.<locals>.generate() 818 if hasattr(self.raw, "stream"): 819 try: --> 820 yield from self.raw.stream(chunk_size, decode_content=True) 821 except ProtocolError as e: 822 raise ChunkedEncodingError(e) File C:\ProgramData\anaconda3\Lib\site-packages\urllib3\response.py:1066, in HTTPResponse.stream(self, amt, decode_content) 1064 else: 1065 while not is_fp_closed(self._fp) or len(self._decoded_buffer) > 0: -> 1066 data = self.read(amt=amt, decode_content=decode_content) 1068 if data: 1069 yield data File C:\ProgramData\anaconda3\Lib\site-packages\urllib3\response.py:955, in HTTPResponse.read(self, amt, decode_content, cache_content) 952 if len(self._decoded_buffer) >= amt: 953 return self._decoded_buffer.get(amt) --> 955 data = self._raw_read(amt) 957 flush_decoder = amt is None or (amt != 0 and not data) 959 if not data and len(self._decoded_buffer) == 0: File C:\ProgramData\anaconda3\Lib\site-packages\urllib3\response.py:879, in HTTPResponse._raw_read(self, amt, read1) 876 fp_closed = getattr(self._fp, "closed", False) 878 with self._error_catcher(): --> 879 data = self._fp_read(amt, read1=read1) if not fp_closed else b"" 880 if amt is not None and amt != 0 and not data: 881 # Platform-specific: Buggy versions of Python. 882 # Close the connection when no data is returned (...) 887 # not properly close the connection in all cases. There is 888 # no harm in redundantly calling close. 889 self._fp.close() File C:\ProgramData\anaconda3\Lib\site-packages\urllib3\response.py:862, in HTTPResponse._fp_read(self, amt, read1) 859 return self._fp.read1(amt) if amt is not None else self._fp.read1() 860 else: 861 # StringIO doesn't like amt=None --> 862 return self._fp.read(amt) if amt is not None else self._fp.read() File C:\ProgramData\anaconda3\Lib\http\client.py:479, in HTTPResponse.read(self, amt) 476 if self.length is not None and amt > self.length: 477 # clip the read to the "end of response" 478 amt = self.length --> 479 s = self.fp.read(amt) 480 if not s and amt: 481 # Ideally, we would raise IncompleteRead if the content-length 482 # wasn't satisfied, but it might break compatibility. 483 self._close_conn() File C:\ProgramData\anaconda3\Lib\socket.py:719, in SocketIO.readinto(self, b) 717 raise OSError("cannot read from timed out object") 718 try: --> 719 return self._sock.recv_into(b) 720 except timeout: 721 self._timeout_occurred = True File C:\ProgramData\anaconda3\Lib\ssl.py:1304, in SSLSocket.recv_into(self, buffer, nbytes, flags) 1300 if flags != 0: 1301 raise ValueError( 1302 "non-zero flags not allowed in calls to recv_into() on %s" % 1303 self.__class__) -> 1304 return self.read(nbytes, buffer) 1305 else: 1306 return super().recv_into(buffer, nbytes, flags) File C:\ProgramData\anaconda3\Lib\ssl.py:1138, in SSLSocket.read(self, len, buffer) 1136 try: 1137 if buffer is not None: -> 1138 return self._sslobj.read(len, buffer) 1139 else: 1140 return self._sslobj.read(len) KeyboardInterrupt:
In [17]:
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()
In [18]:
api.dataset_list_cli(search="jefmenegazzo/passive-vehicular-sensors-dataset-pvs")
ref title size lastUpdated downloadCount voteCount usabilityRating --------------------------------------------------- ---------------------------------------- ----------- -------------------------- ------------- --------- --------------- jefmenegazzo/pvs-passive-vehicular-sensors-datasets PVS - Passive Vehicular Sensors Datasets 44498315084 2021-01-27 20:26:04.113000 20832 78 0.9411765
In [19]:
if os.path.exists(datasets_folder):
shutil.rmtree(datasets_folder)
load_bar_datasets = tqdm(desc="Datasets Download", total=9)
load_bar_files = tqdm(desc="Files Download", total=3)
for dataset in range(1,10):
dataset_path = os.path.join(datasets_folder, "PVS " + str(dataset))
os.makedirs(dataset_path)
load_bar_files.reset()
for file in ["dataset_gps_mpu_left.csv", "dataset_gps_mpu_right.csv", "dataset_labels.csv"]:
dataset_kaggle = 'jefmenegazzo/pvs-passive-vehicular-sensors-datasets'
api.dataset_download_file(dataset=dataset_kaggle, file_name="PVS " + str(dataset) + "/" + file, path=dataset_path)
load_bar_files.update(1)
load_bar_datasets.update(1)
Datasets Download: 0%| | 0/9 [00:00<?, ?it/s]
Files Download: 0%| | 0/3 [00:00<?, ?it/s]
--------------------------------------------------------------------------- PermissionError Traceback (most recent call last) Cell In[19], line 9 7 for dataset in range(1,10): 8 dataset_path = os.path.join(datasets_folder, "PVS " + str(dataset)) ----> 9 os.makedirs(dataset_path) 10 load_bar_files.reset() 12 for file in ["dataset_gps_mpu_left.csv", "dataset_gps_mpu_right.csv", "dataset_labels.csv"]: File C:\ProgramData\anaconda3\Lib\os.py:218, in makedirs(name, mode, exist_ok) 216 if head and tail and not path.exists(head): 217 try: --> 218 makedirs(head, exist_ok=exist_ok) 219 except FileExistsError: 220 # Defeats race condition when another thread created the path 221 pass File C:\ProgramData\anaconda3\Lib\os.py:218, in makedirs(name, mode, exist_ok) 216 if head and tail and not path.exists(head): 217 try: --> 218 makedirs(head, exist_ok=exist_ok) 219 except FileExistsError: 220 # Defeats race condition when another thread created the path 221 pass File C:\ProgramData\anaconda3\Lib\os.py:228, in makedirs(name, mode, exist_ok) 226 return 227 try: --> 228 mkdir(name, mode) 229 except OSError: 230 # Cannot rely on checking for EEXIST, since the operating system 231 # could give priority to other errors like EACCES or EROFS 232 if not exist_ok or not path.isdir(name): PermissionError: [WinError 5] Access is denied: '../input'
In [ ]:
from kaggle.api.kaggle_api_extended import KaggleApi
import pathlib, glob
DATA_DIR = pathlib.Path.cwd() / "PVS_dataset"
SUBSET_DIR = pathlib.Path.cwd() / "PVS_subset"
DATA_DIR.mkdir(exist_ok=True); SUBSET_DIR.mkdir(exist_ok=True)
api = KaggleApi(); api.authenticate()
api.dataset_download_files(
dataset="jefmenegazzo/pvs-passive-vehicular-sensors-datasets",
path=str(DATA_DIR),
unzip=False
)
ZIP_PATH = pathlib.Path(glob.glob(str(DATA_DIR/"*.zip"))[0])
Dataset URL: https://www.kaggle.com/datasets/jefmenegazzo/pvs-passive-vehicular-sensors-datasets
In [2]:
pvs-passive-vehicular-sensors-datasets.zip
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[2], line 1 ----> 1 pvs-passive-vehicular-sensors-datasets.zip NameError: name 'pvs' is not defined
In [3]:
Accident_Detection_Project_Dataset.zip
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[3], line 1 ----> 1 Accident_Detection_Project_Dataset.zip NameError: name 'Accident_Detection_Project_Dataset' is not defined
In [4]:
import pathlib, os, glob, pandas as pd
BASE_PATH = pathlib.Path(r"C:\Users\<sn161663>\Desktop\Accident_Detection_Project_Dataset")
print("Folders inside:", [f.name for f in BASE_PATH.iterdir() if f.is_dir()])
--------------------------------------------------------------------------- OSError Traceback (most recent call last) Cell In[4], line 5 1 import pathlib, os, glob, pandas as pd 3 BASE_PATH = pathlib.Path(r"C:\Users\<sn161663>\Desktop\Accident_Detection_Project_Dataset") ----> 5 print("Folders inside:", [f.name for f in BASE_PATH.iterdir() if f.is_dir()]) File C:\ProgramData\anaconda3\Lib\pathlib\_local.py:575, in Path.iterdir(self) 569 """Yield path objects of the directory contents. 570 571 The children are yielded in arbitrary order, and the 572 special entries '.' and '..' are not included. 573 """ 574 root_dir = str(self) --> 575 with os.scandir(root_dir) as scandir_it: 576 paths = [entry.path for entry in scandir_it] 577 if root_dir == '.': OSError: [WinError 123] The filename, directory name, or volume label syntax is incorrect: 'C:\\Users\\<sn161663>\\Desktop\\Accident_Detection_Project_Dataset'
In [5]:
import pathlib, os, glob, pandas as pd
BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
print("Folders inside:", [f.name for f in BASE_PATH.iterdir() if f.is_dir()])
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[5], line 5 1 import pathlib, os, glob, pandas as pd 3 BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset") ----> 5 print("Folders inside:", [f.name for f in BASE_PATH.iterdir() if f.is_dir()]) File C:\ProgramData\anaconda3\Lib\pathlib\_local.py:575, in Path.iterdir(self) 569 """Yield path objects of the directory contents. 570 571 The children are yielded in arbitrary order, and the 572 special entries '.' and '..' are not included. 573 """ 574 root_dir = str(self) --> 575 with os.scandir(root_dir) as scandir_it: 576 paths = [entry.path for entry in scandir_it] 577 if root_dir == '.': FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset'
In [7]:
import pathlib, os, glob, pandas as pd
BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset.zip")
print("Folders inside:", [f.name for f in BASE_PATH.iterdir() if f.is_dir()])
--------------------------------------------------------------------------- NotADirectoryError Traceback (most recent call last) Cell In[7], line 5 1 import pathlib, os, glob, pandas as pd 3 BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset.zip") ----> 5 print("Folders inside:", [f.name for f in BASE_PATH.iterdir() if f.is_dir()]) File C:\ProgramData\anaconda3\Lib\pathlib\_local.py:575, in Path.iterdir(self) 569 """Yield path objects of the directory contents. 570 571 The children are yielded in arbitrary order, and the 572 special entries '.' and '..' are not included. 573 """ 574 root_dir = str(self) --> 575 with os.scandir(root_dir) as scandir_it: 576 paths = [entry.path for entry in scandir_it] 577 if root_dir == '.': NotADirectoryError: [WinError 267] The directory name is invalid: 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset.zip'
In [8]:
import pathlib, os, pandas as pd, glob
# paste your own full path below exactly as you copied it:
BASE_PATH = pathlib.Path(r"C:\Users\sn161663\OneDrive\Desktop\Accident_Detection_Project_Dataset")
# verify it exists
print("Exists?", BASE_PATH.exists())
Exists? False
In [9]:
import pathlib, os, pandas as pd, glob
# paste your own full path below exactly as you copied it:
BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset.zip")
# verify it exists
print("Exists?", BASE_PATH.exists())
Exists? True
In [10]:
import pathlib, os, glob, pandas as pd
BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset.zip")
print("Folders inside:", [f.name for f in BASE_PATH.iterdir() if f.is_dir()])
--------------------------------------------------------------------------- NotADirectoryError Traceback (most recent call last) Cell In[10], line 5 1 import pathlib, os, glob, pandas as pd 3 BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset.zip") ----> 5 print("Folders inside:", [f.name for f in BASE_PATH.iterdir() if f.is_dir()]) File C:\ProgramData\anaconda3\Lib\pathlib\_local.py:575, in Path.iterdir(self) 569 """Yield path objects of the directory contents. 570 571 The children are yielded in arbitrary order, and the 572 special entries '.' and '..' are not included. 573 """ 574 root_dir = str(self) --> 575 with os.scandir(root_dir) as scandir_it: 576 paths = [entry.path for entry in scandir_it] 577 if root_dir == '.': NotADirectoryError: [WinError 267] The directory name is invalid: 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset.zip'
In [11]:
import pathlib, glob, pandas as pd, os
BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
print("Exists?", BASE_PATH.exists())
print("Sub-folders:", [f.name for f in BASE_PATH.iterdir() if f.is_dir()])
Exists? True Sub-folders: ['PVS 1', 'PVS 2', 'PVS 3', 'PVS 4', 'PVS 5', 'PVS 6', 'PVS 7', 'PVS 8', 'PVS 9']
In [12]:
csvs = sorted(glob.glob(str(BASE_PATH / "**/*.csv"), recursive=True))
print("Total CSV files found:", len(csvs))
print("First few files:")
for f in csvs[:10]:
print(f)
Total CSV files found: 72 First few files: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_gps.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_gps_mpu_left.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_gps_mpu_right.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_labels.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_mpu_left.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_mpu_right.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_settings_left.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_settings_right.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_gps.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_gps_mpu_left.csv
In [13]:
sample = csvs[0]
print("Sample file:", sample)
df = pd.read_csv(sample)
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head()
Sample file: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_gps.csv Shape: (1467, 20) Columns: ['timestamp', 'latitude', 'longitude', 'elevation', 'accuracy', 'bearing', 'speed_meters_per_second', 'satellites', 'provider', 'hdop', 'vdop', 'pdop', 'geoidheight', 'ageofdgpsdata', 'dgpsid', 'activity', 'battery', 'annotation', 'distance_meters', 'elapsed_time_seconds']
Out[13]:
| timestamp | latitude | longitude | elevation | accuracy | bearing | speed_meters_per_second | satellites | provider | hdop | vdop | pdop | geoidheight | ageofdgpsdata | dgpsid | activity | battery | annotation | distance_meters | elapsed_time_seconds | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.577219e+09 | -27.717812 | -51.098895 | 948.770836 | 24.0 | 159.73294 | 0.053275 | 0 | gps | 0.8 | 1.7 | 1.9 | 3.6 | NaN | NaN | NaN | 87 | NaN | 0.000000 | 0.0 |
| 1 | 1.577219e+09 | -27.717818 | -51.098840 | 970.378820 | 12.0 | NaN | 0.000000 | 12 | gps | 0.8 | 1.5 | 1.7 | 3.6 | NaN | NaN | NaN | 87 | NaN | 5.442520 | 2.0 |
| 2 | 1.577219e+09 | -27.717832 | -51.098871 | 989.374267 | 4.0 | NaN | 0.000000 | 13 | gps | 0.8 | 1.6 | 1.8 | 3.6 | NaN | NaN | NaN | 86 | NaN | 3.404871 | 5.0 |
| 3 | 1.577219e+09 | -27.717833 | -51.098867 | 988.439139 | 4.0 | NaN | 0.000000 | 14 | gps | 0.8 | 1.7 | 1.9 | 3.6 | NaN | NaN | NaN | 86 | NaN | 0.421733 | 1.0 |
| 4 | 1.577219e+09 | -27.717835 | -51.098873 | 987.668730 | 4.0 | NaN | 0.000000 | 14 | gps | 0.8 | 1.6 | 1.8 | 3.6 | NaN | NaN | NaN | 86 | NaN | 0.574281 | 1.0 |
In [14]:
import numpy as np
wanted_cols = ['timestamp','ax','ay','az','gx','gy','gz','lat','lon','speed'] # adjust based on step 3
def safe_read(path, wanted):
cols = pd.read_csv(path, nrows=0).columns.tolist()
use = [c for c in wanted if c in cols]
df = pd.read_csv(path, usecols=use)
# downcast numeric types to save RAM
for c in df.select_dtypes(include=['float64','int64']).columns:
df[c] = pd.to_numeric(df[c], downcast='float')
df["folder"] = os.path.basename(os.path.dirname(path))
df["file"] = os.path.basename(path)
return df
frames = []
for p in csvs:
try:
frames.append(safe_read(p, wanted_cols))
except Exception as e:
print("Skipping", p, ":", e)
df_all = pd.concat(frames, ignore_index=True, sort=False)
print("Combined shape:", df_all.shape)
df_all.head()
Combined shape: (4335374, 4)
Out[14]:
| timestamp | folder | file | speed | |
|---|---|---|---|---|
| 0 | 1.577219e+09 | PVS 1 | dataset_gps.csv | NaN |
| 1 | 1.577219e+09 | PVS 1 | dataset_gps.csv | NaN |
| 2 | 1.577219e+09 | PVS 1 | dataset_gps.csv | NaN |
| 3 | 1.577219e+09 | PVS 1 | dataset_gps.csv | NaN |
| 4 | 1.577219e+09 | PVS 1 | dataset_gps.csv | NaN |
In [15]:
out_path = BASE_PATH / "accident_subset.csv"
df_all.to_csv(out_path, index=False)
print("✅ Saved working subset to:", out_path)
✅ Saved working subset to: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\accident_subset.csv
In [16]:
df = pd.read_csv(out_path)
In [17]:
print(df.info())
print(df.describe().T)
print("Missing values:\n", df.isnull().sum())
# Quick plot example
import matplotlib.pyplot as plt
plt.figure(figsize=(10,4))
df['ax'].sample(min(50000, len(df['ax'].dropna()))).hist(bins=60)
plt.title("Distribution of ax"); plt.show()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4335374 entries, 0 to 4335373
Data columns (total 4 columns):
# Column Dtype
--- ------ -----
0 timestamp float64
1 folder object
2 file object
3 speed float64
dtypes: float64(2), object(2)
memory usage: 132.3+ MB
None
count mean std min 25% \
timestamp 4335374.0 1.577307e+09 71719.256410 1.577219e+09 1.577223e+09
speed 2161810.0 9.983220e+00 7.465368 0.000000e+00 4.719659e+00
50% 75% max
timestamp 1.577309e+09 1.577396e+09 1.577400e+09
speed 7.074296e+00 1.664852e+01 2.854857e+01
Missing values:
timestamp 0
folder 0
file 0
speed 2173564
dtype: int64
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\indexes\base.py:3805, in Index.get_loc(self, key) 3804 try: -> 3805 return self._engine.get_loc(casted_key) 3806 except KeyError as err: File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc() File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc() File pandas\\_libs\\hashtable_class_helper.pxi:7081, in pandas._libs.hashtable.PyObjectHashTable.get_item() File pandas\\_libs\\hashtable_class_helper.pxi:7089, in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'ax' The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) Cell In[17], line 8 6 import matplotlib.pyplot as plt 7 plt.figure(figsize=(10,4)) ----> 8 df['ax'].sample(min(50000, len(df['ax'].dropna()))).hist(bins=60) 9 plt.title("Distribution of ax"); plt.show() File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\frame.py:4102, in DataFrame.__getitem__(self, key) 4100 if self.columns.nlevels > 1: 4101 return self._getitem_multilevel(key) -> 4102 indexer = self.columns.get_loc(key) 4103 if is_integer(indexer): 4104 indexer = [indexer] File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\indexes\base.py:3812, in Index.get_loc(self, key) 3807 if isinstance(casted_key, slice) or ( 3808 isinstance(casted_key, abc.Iterable) 3809 and any(isinstance(x, slice) for x in casted_key) 3810 ): 3811 raise InvalidIndexError(key) -> 3812 raise KeyError(key) from err 3813 except TypeError: 3814 # If we have a listlike key, _check_indexing_error will raise 3815 # InvalidIndexError. Otherwise we fall through and re-raise 3816 # the TypeError. 3817 self._check_indexing_error(key) KeyError: 'ax'
<Figure size 1000x400 with 0 Axes>
In [18]:
print("Columns in your dataframe:\n", df.columns.tolist())
Columns in your dataframe: ['timestamp', 'folder', 'file', 'speed']
In [19]:
import matplotlib.pyplot as plt
plt.figure(figsize=(10,4))
df['speed'].dropna().sample(min(50000, len(df['speed'].dropna()))).hist(bins=60)
plt.title("Distribution of Speed")
plt.xlabel("Speed")
plt.ylabel("Frequency")
plt.show()
In [20]:
plt.figure(figsize=(12,5))
plt.plot(df['timestamp'], df['speed'], color='blue', linewidth=0.5)
plt.title("Speed over Time")
plt.xlabel("Timestamp")
plt.ylabel("Speed")
plt.show()
In [21]:
import pandas as pd
df['time'] = pd.to_datetime(df['timestamp'], unit='s')
plt.figure(figsize=(12,5))
plt.plot(df['time'], df['speed'], linewidth=0.5)
plt.title("Speed over Time (datetime)")
plt.xlabel("Time")
plt.ylabel("Speed")
plt.show()
In [22]:
import glob, os
csvs = glob.glob(str(BASE_PATH / "**/*.csv"), recursive=True)
for p in csvs[:20]:
print(p)
C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\accident_subset.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_gps.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_gps_mpu_left.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_gps_mpu_right.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_labels.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_mpu_left.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_mpu_right.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_settings_left.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 1\dataset_settings_right.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_gps.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_gps_mpu_left.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_gps_mpu_right.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_labels.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_mpu_left.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_mpu_right.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_settings_left.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 2\dataset_settings_right.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 3\dataset_gps.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 3\dataset_gps_mpu_left.csv C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\PVS 3\dataset_gps_mpu_right.csv
In [23]:
df_test = pd.read_csv(csvs[0])
print(df_test.columns)
Index(['timestamp', 'folder', 'file', 'speed'], dtype='object')
STEP 2: DATA UNDERSTANDING¶
Inventory¶
In [24]:
import pathlib, os, glob, pandas as pd
BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
assert BASE_PATH.exists(), f"Path not found: {BASE_PATH}"
sessions = [p for p in BASE_PATH.iterdir() if p.is_dir()]
print("Sessions:", [s.name for s in sessions])
rows = []
for s in sessions:
csvs = sorted(glob.glob(str(s / "*.csv")))
for f in csvs:
try:
size_mb = os.path.getsize(f)/1e6
except:
size_mb = None
rows.append({"session": s.name, "file_name": os.path.basename(f), "path": f, "size_MB": round(size_mb,2)})
inv = pd.DataFrame(rows).sort_values(["session","file_name"]).reset_index(drop=True)
display(inv.head(20))
inv.to_csv(BASE_PATH/"reports_data_inventory.csv", index=False)
print("✓ Saved:", BASE_PATH/"reports_data_inventory.csv")
Sessions: ['PVS 1', 'PVS 2', 'PVS 3', 'PVS 4', 'PVS 5', 'PVS 6', 'PVS 7', 'PVS 8', 'PVS 9']
| session | file_name | path | size_MB | |
|---|---|---|---|---|
| 0 | PVS 1 | dataset_gps.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 0.22 |
| 1 | PVS 1 | dataset_gps_mpu_left.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 84.48 |
| 2 | PVS 1 | dataset_gps_mpu_right.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 84.70 |
| 3 | PVS 1 | dataset_labels.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 4.18 |
| 4 | PVS 1 | dataset_mpu_left.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 75.58 |
| 5 | PVS 1 | dataset_mpu_right.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 75.79 |
| 6 | PVS 1 | dataset_settings_left.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 0.00 |
| 7 | PVS 1 | dataset_settings_right.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 0.00 |
| 8 | PVS 2 | dataset_gps.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 0.23 |
| 9 | PVS 2 | dataset_gps_mpu_left.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 72.72 |
| 10 | PVS 2 | dataset_gps_mpu_right.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 73.07 |
| 11 | PVS 2 | dataset_labels.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 3.62 |
| 12 | PVS 2 | dataset_mpu_left.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 65.09 |
| 13 | PVS 2 | dataset_mpu_right.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 65.44 |
| 14 | PVS 2 | dataset_settings_left.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 0.00 |
| 15 | PVS 2 | dataset_settings_right.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 0.00 |
| 16 | PVS 3 | dataset_gps.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 0.20 |
| 17 | PVS 3 | dataset_gps_mpu_left.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 62.57 |
| 18 | PVS 3 | dataset_gps_mpu_right.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 62.45 |
| 19 | PVS 3 | dataset_labels.csv | C:\Users\sn161663\Desktop\Accident_Detection_P... | 3.07 |
✓ Saved: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\reports_data_inventory.csv
Schema Discovery¶
In [25]:
import pandas as pd
schema_rows = []
for _, r in inv.iterrows():
path = r["path"]
try:
cols = pd.read_csv(path, nrows=0).columns.tolist()
dtypes = pd.read_csv(path, nrows=1000).infer_objects().dtypes.astype(str).to_dict()
schema_rows.append({
"session": r["session"],
"file_name": r["file_name"],
"columns": "|".join(cols),
"example_dtypes": "|".join([f"{k}:{v}" for k,v in dtypes.items()])
})
except Exception as e:
schema_rows.append({"session": r["session"], "file_name": r["file_name"], "columns": f"ERROR: {e}", "example_dtypes": ""})
schema = pd.DataFrame(schema_rows)
display(schema.head(20))
schema.to_csv(BASE_PATH/"reports_schema_overview.csv", index=False)
print("✓ Saved:", BASE_PATH/"reports_schema_overview.csv")
| session | file_name | columns | example_dtypes | |
|---|---|---|---|---|
| 0 | PVS 1 | dataset_gps.csv | timestamp|latitude|longitude|elevation|accurac... | timestamp:float64|latitude:float64|longitude:f... |
| 1 | PVS 1 | dataset_gps_mpu_left.csv | timestamp|acc_x_dashboard|acc_y_dashboard|acc_... | timestamp:float64|acc_x_dashboard:float64|acc_... |
| 2 | PVS 1 | dataset_gps_mpu_right.csv | timestamp|acc_x_dashboard|acc_y_dashboard|acc_... | timestamp:float64|acc_x_dashboard:float64|acc_... |
| 3 | PVS 1 | dataset_labels.csv | paved_road|unpaved_road|dirt_road|cobblestone_... | paved_road:int64|unpaved_road:int64|dirt_road:... |
| 4 | PVS 1 | dataset_mpu_left.csv | timestamp|acc_x_dashboard|acc_y_dashboard|acc_... | timestamp:float64|acc_x_dashboard:float64|acc_... |
| 5 | PVS 1 | dataset_mpu_right.csv | timestamp|acc_x_dashboard|acc_y_dashboard|acc_... | timestamp:float64|acc_x_dashboard:float64|acc_... |
| 6 | PVS 1 | dataset_settings_left.csv | placement|address_mpu|address_ak|gyroscope_ful... | placement:object|address_mpu:object|address_ak... |
| 7 | PVS 1 | dataset_settings_right.csv | placement|address_mpu|address_ak|gyroscope_ful... | placement:object|address_mpu:object|address_ak... |
| 8 | PVS 2 | dataset_gps.csv | timestamp|latitude|longitude|elevation|accurac... | timestamp:float64|latitude:float64|longitude:f... |
| 9 | PVS 2 | dataset_gps_mpu_left.csv | timestamp|acc_x_dashboard|acc_y_dashboard|acc_... | timestamp:float64|acc_x_dashboard:float64|acc_... |
| 10 | PVS 2 | dataset_gps_mpu_right.csv | timestamp|acc_x_dashboard|acc_y_dashboard|acc_... | timestamp:float64|acc_x_dashboard:float64|acc_... |
| 11 | PVS 2 | dataset_labels.csv | paved_road|unpaved_road|dirt_road|cobblestone_... | paved_road:int64|unpaved_road:int64|dirt_road:... |
| 12 | PVS 2 | dataset_mpu_left.csv | timestamp|acc_x_dashboard|acc_y_dashboard|acc_... | timestamp:float64|acc_x_dashboard:float64|acc_... |
| 13 | PVS 2 | dataset_mpu_right.csv | timestamp|acc_x_dashboard|acc_y_dashboard|acc_... | timestamp:float64|acc_x_dashboard:float64|acc_... |
| 14 | PVS 2 | dataset_settings_left.csv | placement|address_mpu|address_ak|gyroscope_ful... | placement:object|address_mpu:object|address_ak... |
| 15 | PVS 2 | dataset_settings_right.csv | placement|address_mpu|address_ak|gyroscope_ful... | placement:object|address_mpu:object|address_ak... |
| 16 | PVS 3 | dataset_gps.csv | timestamp|latitude|longitude|elevation|accurac... | timestamp:float64|latitude:float64|longitude:f... |
| 17 | PVS 3 | dataset_gps_mpu_left.csv | timestamp|acc_x_dashboard|acc_y_dashboard|acc_... | timestamp:float64|acc_x_dashboard:float64|acc_... |
| 18 | PVS 3 | dataset_gps_mpu_right.csv | timestamp|acc_x_dashboard|acc_y_dashboard|acc_... | timestamp:float64|acc_x_dashboard:float64|acc_... |
| 19 | PVS 3 | dataset_labels.csv | paved_road|unpaved_road|dirt_road|cobblestone_... | paved_road:int64|unpaved_road:int64|dirt_road:... |
✓ Saved: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\reports_schema_overview.csv
Sampling rate & timestamp health¶
In [26]:
import numpy as np
def quick_time_health(csv_path, ts_col="timestamp", n_rows=300_000):
try:
use = pd.read_csv(csv_path, usecols=[ts_col], nrows=n_rows)
except:
return {"has_timestamp": False, "monotonic": None, "median_dt_ms": None, "notes": "timestamp missing or unreadable"}
ts = use[ts_col].dropna().values
if len(ts) < 3:
return {"has_timestamp": True, "monotonic": None, "median_dt_ms": None, "notes": "too few timestamps"}
diffs = np.diff(ts)
mono = np.all(diffs >= 0)
median_dt = np.median(diffs) * 1000 if np.median(diffs) < 10 else np.median(diffs) # adjust if ts is in seconds
return {"has_timestamp": True, "monotonic": bool(mono), "median_dt_ms": round(float(median_dt),2), "notes": ""}
health = []
for _, r in inv.iterrows():
h = quick_time_health(r["path"])
h.update({"session": r["session"], "file_name": r["file_name"]})
health.append(h)
ts_health = pd.DataFrame(health)
display(ts_health.head(20))
ts_health.to_csv(BASE_PATH/"reports_timestamp_health.csv", index=False)
print("✓ Saved:", BASE_PATH/"reports_timestamp_health.csv")
| has_timestamp | monotonic | median_dt_ms | notes | session | file_name | |
|---|---|---|---|---|---|---|
| 0 | True | True | 1000.0 | PVS 1 | dataset_gps.csv | |
| 1 | True | True | 10.0 | PVS 1 | dataset_gps_mpu_left.csv | |
| 2 | True | True | 10.0 | PVS 1 | dataset_gps_mpu_right.csv | |
| 3 | False | None | NaN | timestamp missing or unreadable | PVS 1 | dataset_labels.csv |
| 4 | True | True | 10.0 | PVS 1 | dataset_mpu_left.csv | |
| 5 | True | True | 10.0 | PVS 1 | dataset_mpu_right.csv | |
| 6 | False | None | NaN | timestamp missing or unreadable | PVS 1 | dataset_settings_left.csv |
| 7 | False | None | NaN | timestamp missing or unreadable | PVS 1 | dataset_settings_right.csv |
| 8 | True | True | 1000.0 | PVS 2 | dataset_gps.csv | |
| 9 | True | True | 10.0 | PVS 2 | dataset_gps_mpu_left.csv | |
| 10 | True | True | 10.0 | PVS 2 | dataset_gps_mpu_right.csv | |
| 11 | False | None | NaN | timestamp missing or unreadable | PVS 2 | dataset_labels.csv |
| 12 | True | True | 10.0 | PVS 2 | dataset_mpu_left.csv | |
| 13 | True | True | 10.0 | PVS 2 | dataset_mpu_right.csv | |
| 14 | False | None | NaN | timestamp missing or unreadable | PVS 2 | dataset_settings_left.csv |
| 15 | False | None | NaN | timestamp missing or unreadable | PVS 2 | dataset_settings_right.csv |
| 16 | True | True | 1000.0 | PVS 3 | dataset_gps.csv | |
| 17 | True | True | 10.0 | PVS 3 | dataset_gps_mpu_left.csv | |
| 18 | True | True | 10.0 | PVS 3 | dataset_gps_mpu_right.csv | |
| 19 | False | None | NaN | timestamp missing or unreadable | PVS 3 | dataset_labels.csv |
✓ Saved: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\reports_timestamp_health.csv
Data Quality¶
In [28]:
from collections import defaultdict
def safe_numeric_summary(path, nrows=300_000):
try:
df = pd.read_csv(path, nrows=nrows)
except Exception as e:
return {"error": str(e)}
summary = {}
for c in df.columns:
if pd.api.types.is_numeric_dtype(df[c]):
col = df[c].dropna()
if len(col) == 0:
summary[c] = {"count": 0, "missing": int(df[c].isna().sum())}
else:
summary[c] = {
"count": int(col.shape[0]),
"missing": int(df[c].isna().sum()),
"min": float(col.min()),
"max": float(col.max()),
"mean": float(col.mean()),
"std": float(col.std()) if col.shape[0] > 1 else 0.0
}
return summary
sum_rows = []
for _, r in inv.iterrows():
s = safe_numeric_summary(r["path"])
if "error" in s:
sum_rows.append({"session": r["session"], "file_name": r["file_name"], "metric": "error", "value": s["error"], "column": ""})
else:
for col, stats in s.items():
for k,v in stats.items():
sum_rows.append({"session": r["session"], "file_name": r["file_name"], "column": col, "metric": k, "value": v})
num_summary = pd.DataFrame(sum_rows)
display(num_summary.head(30))
num_summary.to_csv(BASE_PATH/"reports_numeric_summary_sampled.csv", index=False)
print("✓ Saved:", BASE_PATH/"reports_numeric_summary_sampled.csv")
| session | file_name | column | metric | value | |
|---|---|---|---|---|---|
| 0 | PVS 1 | dataset_gps.csv | timestamp | count | 1.467000e+03 |
| 1 | PVS 1 | dataset_gps.csv | timestamp | missing | 0.000000e+00 |
| 2 | PVS 1 | dataset_gps.csv | timestamp | min | 1.577219e+09 |
| 3 | PVS 1 | dataset_gps.csv | timestamp | max | 1.577220e+09 |
| 4 | PVS 1 | dataset_gps.csv | timestamp | mean | 1.577219e+09 |
| 5 | PVS 1 | dataset_gps.csv | timestamp | std | 4.307223e+02 |
| 6 | PVS 1 | dataset_gps.csv | latitude | count | 1.467000e+03 |
| 7 | PVS 1 | dataset_gps.csv | latitude | missing | 0.000000e+00 |
| 8 | PVS 1 | dataset_gps.csv | latitude | min | -2.771784e+01 |
| 9 | PVS 1 | dataset_gps.csv | latitude | max | -2.768182e+01 |
| 10 | PVS 1 | dataset_gps.csv | latitude | mean | -2.769508e+01 |
| 11 | PVS 1 | dataset_gps.csv | latitude | std | 1.175118e-02 |
| 12 | PVS 1 | dataset_gps.csv | longitude | count | 1.467000e+03 |
| 13 | PVS 1 | dataset_gps.csv | longitude | missing | 0.000000e+00 |
| 14 | PVS 1 | dataset_gps.csv | longitude | min | -5.113269e+01 |
| 15 | PVS 1 | dataset_gps.csv | longitude | max | -5.109884e+01 |
| 16 | PVS 1 | dataset_gps.csv | longitude | mean | -5.111933e+01 |
| 17 | PVS 1 | dataset_gps.csv | longitude | std | 1.137600e-02 |
| 18 | PVS 1 | dataset_gps.csv | elevation | count | 1.467000e+03 |
| 19 | PVS 1 | dataset_gps.csv | elevation | missing | 0.000000e+00 |
| 20 | PVS 1 | dataset_gps.csv | elevation | min | 8.748351e+02 |
| 21 | PVS 1 | dataset_gps.csv | elevation | max | 9.959747e+02 |
| 22 | PVS 1 | dataset_gps.csv | elevation | mean | 9.255807e+02 |
| 23 | PVS 1 | dataset_gps.csv | elevation | std | 4.069137e+01 |
| 24 | PVS 1 | dataset_gps.csv | accuracy | count | 1.467000e+03 |
| 25 | PVS 1 | dataset_gps.csv | accuracy | missing | 0.000000e+00 |
| 26 | PVS 1 | dataset_gps.csv | accuracy | min | 4.000000e+00 |
| 27 | PVS 1 | dataset_gps.csv | accuracy | max | 2.400000e+01 |
| 28 | PVS 1 | dataset_gps.csv | accuracy | mean | 4.092706e+00 |
| 29 | PVS 1 | dataset_gps.csv | accuracy | std | 6.747199e-01 |
✓ Saved: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\reports_numeric_summary_sampled.csv
Label Reconnaissance¶
In [29]:
possible_label_names = {"label","labels","class","target","event","accident"}
label_hits = []
for _, r in inv.iterrows():
try:
cols = pd.read_csv(r["path"], nrows=0).columns
found = [c for c in cols if c.lower() in possible_label_names]
if found:
label_hits.append({"session": r["session"], "file_name": r["file_name"], "label_cols": "|".join(found)})
except:
pass
label_index = pd.DataFrame(label_hits)
display(label_index if not label_index.empty else "No explicit label columns found.")
if not label_index.empty:
label_index.to_csv(BASE_PATH/"reports_label_columns.csv", index=False)
'No explicit label columns found.'
Numeric Data Quality Checks
In [30]:
import numpy as np
def sample_numeric_stats(path, nrows=200_000):
try:
df = pd.read_csv(path, nrows=nrows)
numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
summary = df[numeric_cols].describe().T
summary["missing_%"] = df[numeric_cols].isna().mean()*100
summary["file"] = os.path.basename(path)
return summary
except Exception as e:
return pd.DataFrame({"error":[str(e)], "file":[path]})
numeric_summaries = []
for _, r in inv.iterrows():
numeric_summaries.append(sample_numeric_stats(r["path"]))
num_df = pd.concat(numeric_summaries, ignore_index=True, sort=False)
display(num_df.head(10))
num_df.to_csv(BASE_PATH/"reports_numeric_overview.csv", index=False)
print("✓ Saved:", BASE_PATH/"reports_numeric_overview.csv")
| count | mean | std | min | 25% | 50% | 75% | max | missing_% | file | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1467.0 | 1.577219e+09 | 430.722270 | 1.577219e+09 | 1.577219e+09 | 1.577219e+09 | 1.577220e+09 | 1.577220e+09 | 0.000000 | dataset_gps.csv |
| 1 | 1467.0 | -2.769508e+01 | 0.011751 | -2.771784e+01 | -2.770213e+01 | -2.768987e+01 | -2.768708e+01 | -2.768182e+01 | 0.000000 | dataset_gps.csv |
| 2 | 1467.0 | -5.111933e+01 | 0.011376 | -5.113269e+01 | -5.112895e+01 | -5.112469e+01 | -5.110955e+01 | -5.109884e+01 | 0.000000 | dataset_gps.csv |
| 3 | 1467.0 | 9.255807e+02 | 40.691374 | 8.748351e+02 | 8.890266e+02 | 9.084693e+02 | 9.615848e+02 | 9.959747e+02 | 0.000000 | dataset_gps.csv |
| 4 | 1467.0 | 4.092706e+00 | 0.674720 | 4.000000e+00 | 4.000000e+00 | 4.000000e+00 | 4.000000e+00 | 2.400000e+01 | 0.000000 | dataset_gps.csv |
| 5 | 1458.0 | 2.136291e+02 | 95.640711 | 1.006545e+00 | 1.382972e+02 | 1.973806e+02 | 3.171247e+02 | 3.597907e+02 | 0.613497 | dataset_gps.csv |
| 6 | 1467.0 | 9.286443e+00 | 7.820981 | 0.000000e+00 | 4.271836e+00 | 6.516178e+00 | 1.495587e+01 | 2.687448e+01 | 0.000000 | dataset_gps.csv |
| 7 | 1467.0 | 1.506748e+01 | 0.666034 | 0.000000e+00 | 1.500000e+01 | 1.500000e+01 | 1.500000e+01 | 1.600000e+01 | 0.000000 | dataset_gps.csv |
| 8 | 1467.0 | 8.117928e-01 | 0.037540 | 8.000000e-01 | 8.000000e-01 | 8.000000e-01 | 8.000000e-01 | 1.100000e+00 | 0.000000 | dataset_gps.csv |
| 9 | 1467.0 | 1.437832e+00 | 0.154977 | 1.000000e+00 | 1.300000e+00 | 1.500000e+00 | 1.600000e+00 | 2.100000e+00 | 0.000000 | dataset_gps.csv |
✓ Saved: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\reports_numeric_overview.csv
Sample Visualizations - Numeric Distribution¶
In [31]:
import matplotlib.pyplot as plt
sample_path = inv.iloc[0]["path"]
df_sample = pd.read_csv(sample_path, nrows=100_000)
numeric_cols = [c for c in df_sample.columns if pd.api.types.is_numeric_dtype(df_sample[c]) and c!="timestamp"]
for c in numeric_cols[:5]:
plt.figure(figsize=(8,3))
df_sample[c].hist(bins=60)
plt.title(f"Distribution of {c}")
plt.xlabel(c)
plt.ylabel("Frequency")
plt.show()
Sample Visualizations - Short Time-Series Preview¶
In [33]:
if "speed" in df_sample.columns:
plt.figure(figsize=(12,4))
plt.plot(df_sample["speed"].iloc[:10000])
plt.title("Speed – First 10k samples")
plt.xlabel("Sample index")
plt.ylabel("Speed")
plt.show()
# if accel axes exist
axes = [a for a in ["ax","ay","az"] if a in df_sample.columns]
if len(axes)==3:
accel_mag = np.sqrt((df_sample[axes]**2).sum(axis=1))
plt.figure(figsize=(12,4))
plt.plot(accel_mag.iloc[:10000])
plt.title("Acceleration Magnitude – First 10k samples")
plt.xlabel("Sample index")
plt.ylabel("|a|")
plt.show()
GPS & Spatial Sanity¶
In [34]:
if {"lat","lon"}.issubset(df_sample.columns):
lat_ok = df_sample["lat"].between(-90,90).mean()
lon_ok = df_sample["lon"].between(-180,180).mean()
print(f"Latitude valid ratio: {lat_ok:.3f}, Longitude valid ratio: {lon_ok:.3f}")
Preliminary Correlation Check¶
In [35]:
import seaborn as sns
numeric_cols = [c for c in df_sample.columns if pd.api.types.is_numeric_dtype(df_sample[c])]
plt.figure(figsize=(10,6))
sns.heatmap(df_sample[numeric_cols].corr(), cmap="coolwarm", center=0)
plt.title("Feature Correlation Heatmap (sample)")
plt.show()
STEP 3: DATA PREPARATION¶
Setup - Folders + Configuration¶
In [22]:
import pathlib, json, os, glob
import numpy as np
import pandas as pd
# 1) Point to your extracted folder that contains: PVS 1, PVS 2, ... PVS 9
BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
assert BASE_PATH.exists(), f"Path not found: {BASE_PATH}"
# 2) Working folders
WORK = BASE_PATH / "work"
CLEAN = WORK / "clean_resampled" # synchronized, filtered per session
WINDOWS= WORK / "windows" # feature windows per session
for d in (WORK, CLEAN, WINDOWS):
d.mkdir(parents=True, exist_ok=True)
# 3) Data-prep configuration (tune later if needed)
CFG = {
"resample_hz": 50, # common rate (Hz)
"lowpass_hz": 15, # accel/gyro low-pass cutoff (Hz)
"window_sec": 3.0, # sliding window length
"overlap": 0.5, # 50% overlap
# Proxy-label thresholds (units: see notes below)
"accel_mag_g": 1.8, # ≈1.8 g (if your accel is already m/s^2, multiply by 9.81 later)
"gyro_mag_dps": 250, # deg/s
"speed_drop_mps": 4.0, # sudden speed drop over short window (m/s)
"speed_drop_window_s": 1.0
}
(WORK / "config.json").write_text(json.dumps(CFG, indent=2))
print("✓ Config written:", WORK / "config.json")
✓ Config written: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\config.json
Discover files per session¶
In [14]:
# list sessions
SESSIONS = [p for p in BASE_PATH.iterdir() if p.is_dir() and p.name.lower().startswith("pvs")]
SESSIONS = sorted(SESSIONS, key=lambda p: p.name)
print("Sessions found:", [s.name for s in SESSIONS])
def find_files(session_path):
files = sorted(glob.glob(str(session_path / "*.csv")))
out = {"accel": None, "gyro": None, "gps": None, "labels": None}
for f in files:
try:
cols = pd.read_csv(f, nrows=0).columns.str.lower().tolist()
except Exception:
continue
cs = set(cols)
if {"ax","ay","az"}.issubset(cs): out["accel"] = f
if {"gx","gy","gz"}.issubset(cs): out["gyro"] = f
if {"lat","lon"}.issubset(cs) or ("speed" in cs): out["gps"] = f
if any(c in cs for c in ["label","labels","class","target","event","accident"]): out["labels"] = f
return out
file_map = {s.name: find_files(s) for s in SESSIONS}
file_map
Sessions found: ['PVS 1', 'PVS 2', 'PVS 3', 'PVS 4', 'PVS 5', 'PVS 6', 'PVS 7', 'PVS 8', 'PVS 9']
Out[14]:
{'PVS 1': {'accel': None,
'gyro': None,
'gps': 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset\\PVS 1\\dataset_gps_mpu_right.csv',
'labels': None},
'PVS 2': {'accel': None,
'gyro': None,
'gps': 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset\\PVS 2\\dataset_gps_mpu_right.csv',
'labels': None},
'PVS 3': {'accel': None,
'gyro': None,
'gps': 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset\\PVS 3\\dataset_gps_mpu_right.csv',
'labels': None},
'PVS 4': {'accel': None,
'gyro': None,
'gps': 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset\\PVS 4\\dataset_gps_mpu_right.csv',
'labels': None},
'PVS 5': {'accel': None,
'gyro': None,
'gps': 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset\\PVS 5\\dataset_gps_mpu_right.csv',
'labels': None},
'PVS 6': {'accel': None,
'gyro': None,
'gps': 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset\\PVS 6\\dataset_gps_mpu_right.csv',
'labels': None},
'PVS 7': {'accel': None,
'gyro': None,
'gps': 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset\\PVS 7\\dataset_gps_mpu_right.csv',
'labels': None},
'PVS 8': {'accel': None,
'gyro': None,
'gps': 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset\\PVS 8\\dataset_gps_mpu_right.csv',
'labels': None},
'PVS 9': {'accel': None,
'gyro': None,
'gps': 'C:\\Users\\sn161663\\Desktop\\Accident_Detection_Project_Dataset\\PVS 9\\dataset_gps_mpu_right.csv',
'labels': None}}
Timestamp, filters, resampling, synchronization - Helpers¶
In [15]:
from scipy.signal import butter, filtfilt
def ensure_time_index(df, ts_col="timestamp"):
if ts_col not in df.columns:
raise ValueError(f"timestamp column '{ts_col}' not found.")
ts = df[ts_col].astype("float64")
# seconds vs milliseconds heuristic
if ts.dropna().median() > 1e11:
t = pd.to_datetime(ts, unit="ms", errors="coerce")
else:
t = pd.to_datetime(ts, unit="s", errors="coerce")
out = df.drop(columns=[ts_col]).copy()
out.index = t
out = out[~out.index.duplicated(keep="first")].sort_index()
return out
def butter_lowpass(series, cutoff_hz, fs_hz, order=4):
if series.isna().all():
return series
b, a = butter(order, cutoff_hz/(0.5*fs_hz), btype='low', analog=False)
x = series.interpolate(limit=5).bfill().ffill()
try:
y = filtfilt(b, a, x.values)
return pd.Series(y, index=series.index)
except Exception:
return series
def resample_df(df, fs_hz, agg="mean"):
rule = f"{int(1000/fs_hz)}ms" # 'ms' (milliseconds)
num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
return df[num_cols].resample(rule).agg(agg)
def sync_and_filter(session_name, files, cfg=CFG):
parts = []
# Accelerometer
if files["accel"]:
a = pd.read_csv(files["accel"])
a = ensure_time_index(a, "timestamp")
for ax in [c for c in ["ax","ay","az"] if c in a.columns]:
a[ax] = butter_lowpass(a[ax], cfg["lowpass_hz"], cfg["resample_hz"])
a = resample_df(a, cfg["resample_hz"], "mean")
parts.append(a.add_prefix("acc_"))
# Gyroscope
if files["gyro"]:
g = pd.read_csv(files["gyro"])
g = ensure_time_index(g, "timestamp")
for gx in [c for c in ["gx","gy","gz"] if c in g.columns]:
g[gx] = butter_lowpass(g[gx], cfg["lowpass_hz"], cfg["resample_hz"])
g = resample_df(g, cfg["resample_hz"], "mean")
parts.append(g.add_prefix("gyro_"))
# GPS / speed
if files["gps"]:
gps = pd.read_csv(files["gps"])
gps = ensure_time_index(gps, "timestamp")
gps = resample_df(gps, cfg["resample_hz"], "mean")
parts.append(gps.add_prefix("gps_"))
if not parts:
raise RuntimeError(f"No usable sensor files for {session_name}")
df = parts[0].join(parts[1:], how="outer").sort_index()
df = df.interpolate(limit=5).ffill().bfill()
df["session"] = session_name
return df
Proxy Labels¶
In [16]:
def add_proxy_columns(df, cfg=CFG):
# acceleration magnitude (if present)
if all(c in df.columns for c in ["acc_ax","acc_ay","acc_az"]):
df["acc_mag"] = np.sqrt(df["acc_ax"]**2 + df["acc_ay"]**2 + df["acc_az"]**2)
# gyro magnitude (if present)
if all(c in df.columns for c in ["gyro_gx","gyro_gy","gyro_gz"]):
df["gyro_mag"] = np.sqrt(df["gyro_gx"]**2 + df["gyro_gy"]**2 + df["gyro_gz"]**2)
# sudden speed drop (m/s) over short window
if "gps_speed" in df.columns:
k = max(int(cfg["speed_drop_window_s"] * cfg["resample_hz"]), 2)
s = df["gps_speed"].ffill()
df["speed_drop"] = s - s.rolling(k, min_periods=1).min()
else:
df["speed_drop"] = np.nan
conds = []
if "acc_mag" in df.columns:
conds.append(df["acc_mag"] >= cfg["accel_mag_g"] * 9.81) # remove *9.81 if accel already in g
if "gyro_mag" in df.columns:
conds.append(df["gyro_mag"] >= cfg["gyro_mag_dps"])
if df["speed_drop"].notna().any():
conds.append(df["speed_drop"] >= cfg["speed_drop_mps"])
df["proxy_incident"] = 0
if conds:
df.loc[np.logical_or.reduce(conds), "proxy_incident"] = 1
return df
Process each session - save clean, resampled, labeled data¶
In [17]:
PROCESSED = []
for s in SESSIONS:
files = file_map[s.name]
try:
df = sync_and_filter(s.name, files, CFG)
df = add_proxy_columns(df, CFG)
out = CLEAN / f"{s.name}.parquet"
df.to_parquet(out, index=True)
PROCESSED.append({"session": s.name, "rows": len(df), "path": str(out)})
print(f"✓ {s.name}: {len(df):,} rows -> {out.name}")
except Exception as e:
print(f"✗ {s.name}:", e)
pd.DataFrame(PROCESSED).to_csv(WORK/"processed_sessions.csv", index=False)
print("Saved index:", WORK/"processed_sessions.csv")
✓ PVS 1: 72,019 rows -> PVS 1.parquet ✓ PVS 2: 62,343 rows -> PVS 2.parquet ✓ PVS 3: 52,908 rows -> PVS 3.parquet ✓ PVS 4: 66,246 rows -> PVS 4.parquet ✓ PVS 5: 66,939 rows -> PVS 5.parquet ✓ PVS 6: 48,141 rows -> PVS 6.parquet ✓ PVS 7: 64,274 rows -> PVS 7.parquet ✓ PVS 8: 61,810 rows -> PVS 8.parquet ✓ PVS 9: 45,778 rows -> PVS 9.parquet Saved index: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\processed_sessions.csv
In [8]:
test_df = pd.read_parquet(CLEAN / "PVS 1.parquet")
test_df.head(), test_df.columns.tolist()[:20]
Out[8]:
( gps_acc_x_dashboard gps_acc_y_dashboard \
timestamp
2019-12-24 20:19:56.540 0.314897 0.187227
2019-12-24 20:19:56.560 0.297539 0.187227
2019-12-24 20:19:56.580 0.308912 0.199198
2019-12-24 20:19:56.600 0.317292 0.157299
2019-12-24 20:19:56.620 0.295744 0.148919
gps_acc_z_dashboard gps_acc_x_above_suspension \
timestamp
2019-12-24 20:19:56.540 9.863572 0.314750
2019-12-24 20:19:56.560 9.869558 0.313553
2019-12-24 20:19:56.580 9.842024 0.332706
2019-12-24 20:19:56.600 9.859981 0.297991
2019-12-24 20:19:56.620 9.885120 0.266866
gps_acc_y_above_suspension \
timestamp
2019-12-24 20:19:56.540 0.166426
2019-12-24 20:19:56.560 0.154455
2019-12-24 20:19:56.580 0.159244
2019-12-24 20:19:56.600 0.156849
2019-12-24 20:19:56.620 0.156849
gps_acc_z_above_suspension \
timestamp
2019-12-24 20:19:56.540 9.808869
2019-12-24 20:19:56.560 9.855556
2019-12-24 20:19:56.580 9.831614
2019-12-24 20:19:56.600 9.824431
2019-12-24 20:19:56.620 9.835205
gps_acc_x_below_suspension \
timestamp
2019-12-24 20:19:56.540 0.529819
2019-12-24 20:19:56.560 0.525031
2019-12-24 20:19:56.580 0.533411
2019-12-24 20:19:56.600 0.498695
2019-12-24 20:19:56.620 0.496300
gps_acc_y_below_suspension \
timestamp
2019-12-24 20:19:56.540 0.097111
2019-12-24 20:19:56.560 0.100702
2019-12-24 20:19:56.580 0.092323
2019-12-24 20:19:56.600 0.404766
2019-12-24 20:19:56.620 0.097111
gps_acc_z_below_suspension gps_gyro_x_dashboard \
timestamp
2019-12-24 20:19:56.540 9.930623 0.221062
2019-12-24 20:19:56.560 9.948579 0.045586
2019-12-24 20:19:56.580 9.887527 0.175285
2019-12-24 20:19:56.600 9.923440 0.205803
2019-12-24 20:19:56.620 9.856403 -0.206184
... gps_temp_dashboard gps_temp_above_suspension \
timestamp ...
2019-12-24 20:19:56.540 ... 34.274628 34.035014
2019-12-24 20:19:56.560 ... 34.358493 34.082936
2019-12-24 20:19:56.580 ... 34.370474 33.939168
2019-12-24 20:19:56.600 ... 34.514242 33.963129
2019-12-24 20:19:56.620 ... 34.370474 34.082936
gps_temp_below_suspension gps_timestamp_gps \
timestamp
2019-12-24 20:19:56.540 31.926408 1.577219e+09
2019-12-24 20:19:56.560 31.734717 1.577219e+09
2019-12-24 20:19:56.580 31.447180 1.577219e+09
2019-12-24 20:19:56.600 31.447180 1.577219e+09
2019-12-24 20:19:56.620 31.638871 1.577219e+09
gps_latitude gps_longitude gps_speed session \
timestamp
2019-12-24 20:19:56.540 -27.717841 -51.098865 0.009128 PVS 1
2019-12-24 20:19:56.560 -27.717841 -51.098865 0.009128 PVS 1
2019-12-24 20:19:56.580 -27.717841 -51.098865 0.009128 PVS 1
2019-12-24 20:19:56.600 -27.717841 -51.098865 0.009128 PVS 1
2019-12-24 20:19:56.620 -27.717841 -51.098865 0.009128 PVS 1
speed_drop proxy_incident
timestamp
2019-12-24 20:19:56.540 0.0 0
2019-12-24 20:19:56.560 0.0 0
2019-12-24 20:19:56.580 0.0 0
2019-12-24 20:19:56.600 0.0 0
2019-12-24 20:19:56.620 0.0 0
[5 rows x 34 columns],
['gps_acc_x_dashboard',
'gps_acc_y_dashboard',
'gps_acc_z_dashboard',
'gps_acc_x_above_suspension',
'gps_acc_y_above_suspension',
'gps_acc_z_above_suspension',
'gps_acc_x_below_suspension',
'gps_acc_y_below_suspension',
'gps_acc_z_below_suspension',
'gps_gyro_x_dashboard',
'gps_gyro_y_dashboard',
'gps_gyro_z_dashboard',
'gps_gyro_x_above_suspension',
'gps_gyro_y_above_suspension',
'gps_gyro_z_above_suspension',
'gps_gyro_x_below_suspension',
'gps_gyro_y_below_suspension',
'gps_gyro_z_below_suspension',
'gps_mag_x_dashboard',
'gps_mag_y_dashboard'])
Windowing - create training instances (features + label)¶
In [18]:
def window_iter(df, win_s, step_s, fs):
n = len(df); win = int(win_s * fs); step = int(step_s * fs)
i = 0
while i + win <= n:
seg = df.iloc[i:i+win]
yield seg.index[0], seg.index[-1], seg
i += step
def basic_window_features(seg):
feats = {}
cols = [c for c in seg.columns if pd.api.types.is_numeric_dtype(seg[c])]
for c in cols:
x = seg[c].dropna()
if x.empty: continue
feats[f"{c}_mean"] = float(x.mean())
feats[f"{c}_std"] = float(x.std())
feats[f"{c}_min"] = float(x.min())
feats[f"{c}_max"] = float(x.max())
feats[f"{c}_rms"] = float(np.sqrt(np.mean(np.square(x))))
return feats
def build_windows_for_session(session_name, cfg=CFG):
path = CLEAN / f"{session_name}.parquet"
df = pd.read_parquet(path)
fs = cfg["resample_hz"]
win_s = cfg["window_sec"]
step_s = cfg["window_sec"] * (1 - cfg["overlap"])
rows = []
for t0, t1, seg in window_iter(df, win_s, step_s, fs):
feats = basic_window_features(seg)
y = int(seg["proxy_incident"].max()) if "proxy_incident" in seg.columns else 0
rows.append({"session": session_name, "t_start": t0, "t_end": t1, "y": y, **feats})
win_df = pd.DataFrame(rows)
out = WINDOWS / f"{session_name}_windows.parquet"
win_df.to_parquet(out, index=False)
return out, len(win_df), (win_df["y"].mean() if len(win_df) else 0.0)
In [19]:
SUMMARY = []
for s in SESSIONS:
p = CLEAN / f"{s.name}.parquet"
if not p.exists():
print(f"Skipping {s.name} (no cleaned file)")
continue
out, nrows, pos_rate = build_windows_for_session(s.name, CFG)
SUMMARY.append({"session": s.name, "windows": nrows, "positive_rate": round(pos_rate, 4), "path": str(out)})
print(f"✓ Windows {s.name}: {nrows:,} (pos_rate={pos_rate:.4f})")
pd.DataFrame(SUMMARY).to_csv(WORK/"windows_summary.csv", index=False)
print("Saved:", WORK/"windows_summary.csv")
✓ Windows PVS 1: 959 (pos_rate=0.0000) ✓ Windows PVS 2: 830 (pos_rate=0.0024) ✓ Windows PVS 3: 704 (pos_rate=0.0000) ✓ Windows PVS 4: 882 (pos_rate=0.0000) ✓ Windows PVS 5: 891 (pos_rate=0.0022) ✓ Windows PVS 6: 640 (pos_rate=0.0031) ✓ Windows PVS 7: 855 (pos_rate=0.0035) ✓ Windows PVS 8: 823 (pos_rate=0.0036) ✓ Windows PVS 9: 609 (pos_rate=0.0000) Saved: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\windows_summary.csv
Sanity checks on prepared data¶
In [23]:
# Cleaned file check
df_clean = pd.read_parquet(CLEAN / f"{SESSIONS[0].name}.parquet") # first session
print("Clean shape:", df_clean.shape)
have = [c for c in ["acc_mag","gyro_mag","gps_speed","speed_drop","proxy_incident"]
if c in df_clean.columns]
print("Summary columns:", have)
if have: display(df_clean[have].describe())
# Window file check
win_files = sorted(glob.glob(str(WINDOWS/"*_windows.parquet")))
print("Window files:", len(win_files))
if win_files:
w = pd.read_parquet(win_files[0])
print("Windows shape:", w.shape)
if "y" in w.columns:
print("Class balance:", (w["y"].value_counts(normalize=True)
.rename({0:"normal",1:"incident"})))
display(w.head())
Clean shape: (72019, 34) Summary columns: ['gps_speed', 'speed_drop', 'proxy_incident']
| gps_speed | speed_drop | proxy_incident | |
|---|---|---|---|
| count | 72019.000000 | 72019.000000 | 72019.0 |
| mean | 9.556739 | 0.159041 | 0.0 |
| std | 7.746441 | 0.285608 | 0.0 |
| min | 0.002526 | 0.000000 | 0.0 |
| 25% | 4.508887 | 0.000000 | 0.0 |
| 50% | 6.618945 | 0.004228 | 0.0 |
| 75% | 16.647470 | 0.225994 | 0.0 |
| max | 26.874480 | 3.895296 | 0.0 |
Window files: 9 Windows shape: (959, 169) Class balance: y normal 1.0 Name: proportion, dtype: float64
| session | t_start | t_end | y | gps_acc_x_dashboard_mean | gps_acc_x_dashboard_std | gps_acc_x_dashboard_min | gps_acc_x_dashboard_max | gps_acc_x_dashboard_rms | gps_acc_y_dashboard_mean | ... | speed_drop_mean | speed_drop_std | speed_drop_min | speed_drop_max | speed_drop_rms | proxy_incident_mean | proxy_incident_std | proxy_incident_min | proxy_incident_max | proxy_incident_rms | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | PVS 1 | 2019-12-24 20:19:56.540 | 2019-12-24 20:19:59.520 | 0 | 0.306168 | 0.016296 | 0.270605 | 0.365176 | 0.306598 | 0.165220 | ... | 0.000099 | 0.000143 | 0.0 | 0.000303 | 0.000173 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | PVS 1 | 2019-12-24 20:19:58.040 | 2019-12-24 20:20:01.020 | 0 | 0.307486 | 0.026242 | 0.223918 | 0.374752 | 0.308596 | 0.163978 | ... | 0.000201 | 0.000913 | 0.0 | 0.007933 | 0.000932 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | PVS 1 | 2019-12-24 20:19:59.540 | 2019-12-24 20:20:02.520 | 0 | 0.306282 | 0.030071 | 0.195187 | 0.385526 | 0.307745 | 0.163780 | ... | 0.002592 | 0.003733 | 0.0 | 0.007933 | 0.004534 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | PVS 1 | 2019-12-24 20:20:01.040 | 2019-12-24 20:20:04.020 | 0 | 0.307264 | 0.024270 | 0.195187 | 0.385526 | 0.308215 | 0.164431 | ... | 0.002487 | 0.003691 | 0.0 | 0.007933 | 0.004441 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | PVS 1 | 2019-12-24 20:20:02.540 | 2019-12-24 20:20:05.520 | 0 | 0.306369 | 0.018382 | 0.263422 | 0.367570 | 0.306916 | 0.163311 | ... | 0.000186 | 0.000305 | 0.0 | 0.000827 | 0.000357 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 169 columns
STEP 4: VISUALIZATION & EDA¶
Imports & paths¶
In [26]:
import pathlib, glob, json, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Pretty plots
plt.rcParams["figure.figsize"] = (10, 4)
plt.rcParams["axes.grid"] = True
# === CHANGE THIS TO YOUR FOLDER ===
BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
WORK = BASE_PATH / "work"
CLEAN = WORK / "clean_resampled"
WINDOWS = WORK / "windows"
# Load config if present
CFG_PATH = WORK / "config.json"
CFG = json.loads(CFG_PATH.read_text()) if CFG_PATH.exists() else {
"resample_hz": 50, "window_sec": 3.0, "overlap": 0.5
}
# Utility: safe column grab
def cols_exist(df, cols):
return [c for c in cols if c in df.columns]
Inventory¶
In [27]:
proc_idx = pd.read_csv(WORK / "processed_sessions.csv")
win_sum = pd.read_csv(WORK / "windows_summary.csv")
print("Cleaned sessions:", len(proc_idx))
display(proc_idx.head())
print("\nWindow files:", len(win_sum))
display(win_sum.head())
print("\nTotals:")
print(" Total cleaned rows:", f"{proc_idx['rows'].sum():,}")
print(" Total windows:", f"{win_sum['windows'].sum():,}")
print(" Mean positive rate:", round(win_sum['positive_rate'].mean(), 4))
Cleaned sessions: 9
| session | rows | path | |
|---|---|---|---|
| 0 | PVS 1 | 72019 | C:\Users\sn161663\Desktop\Accident_Detection_P... |
| 1 | PVS 2 | 62343 | C:\Users\sn161663\Desktop\Accident_Detection_P... |
| 2 | PVS 3 | 52908 | C:\Users\sn161663\Desktop\Accident_Detection_P... |
| 3 | PVS 4 | 66246 | C:\Users\sn161663\Desktop\Accident_Detection_P... |
| 4 | PVS 5 | 66939 | C:\Users\sn161663\Desktop\Accident_Detection_P... |
Window files: 9
| session | windows | positive_rate | path | |
|---|---|---|---|---|
| 0 | PVS 1 | 959 | 0.0000 | C:\Users\sn161663\Desktop\Accident_Detection_P... |
| 1 | PVS 2 | 830 | 0.0024 | C:\Users\sn161663\Desktop\Accident_Detection_P... |
| 2 | PVS 3 | 704 | 0.0000 | C:\Users\sn161663\Desktop\Accident_Detection_P... |
| 3 | PVS 4 | 882 | 0.0000 | C:\Users\sn161663\Desktop\Accident_Detection_P... |
| 4 | PVS 5 | 891 | 0.0022 | C:\Users\sn161663\Desktop\Accident_Detection_P... |
Totals: Total cleaned rows: 540,458 Total windows: 7,193 Mean positive rate: 0.0016
Peek at one cleaned session¶
In [28]:
# Choose first session that exists
first_path = CLEAN / (proc_idx.loc[0, "session"] + ".parquet")
df = pd.read_parquet(first_path)
print("Shape:", df.shape)
print("\nColumns:", list(df.columns)[:30], "...\n")
print(df.info())
display(df.head(5))
# Basic stats for commonly-used columns (only those that exist)
want = ["acc_ax","acc_ay","acc_az","gyro_gx","gyro_gy","gyro_gz",
"gps_speed","acc_mag","gyro_mag","speed_drop","proxy_incident"]
have = cols_exist(df, want)
if have:
display(df[have].describe().T)
else:
print("No standard numeric columns found in this session.")
Shape: (72019, 34) Columns: ['gps_acc_x_dashboard', 'gps_acc_y_dashboard', 'gps_acc_z_dashboard', 'gps_acc_x_above_suspension', 'gps_acc_y_above_suspension', 'gps_acc_z_above_suspension', 'gps_acc_x_below_suspension', 'gps_acc_y_below_suspension', 'gps_acc_z_below_suspension', 'gps_gyro_x_dashboard', 'gps_gyro_y_dashboard', 'gps_gyro_z_dashboard', 'gps_gyro_x_above_suspension', 'gps_gyro_y_above_suspension', 'gps_gyro_z_above_suspension', 'gps_gyro_x_below_suspension', 'gps_gyro_y_below_suspension', 'gps_gyro_z_below_suspension', 'gps_mag_x_dashboard', 'gps_mag_y_dashboard', 'gps_mag_z_dashboard', 'gps_mag_x_above_suspension', 'gps_mag_y_above_suspension', 'gps_mag_z_above_suspension', 'gps_temp_dashboard', 'gps_temp_above_suspension', 'gps_temp_below_suspension', 'gps_timestamp_gps', 'gps_latitude', 'gps_longitude'] ... <class 'pandas.core.frame.DataFrame'> DatetimeIndex: 72019 entries, 2019-12-24 20:19:56.540000 to 2019-12-24 20:43:56.900000 Data columns (total 34 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gps_acc_x_dashboard 72019 non-null float64 1 gps_acc_y_dashboard 72019 non-null float64 2 gps_acc_z_dashboard 72019 non-null float64 3 gps_acc_x_above_suspension 72019 non-null float64 4 gps_acc_y_above_suspension 72019 non-null float64 5 gps_acc_z_above_suspension 72019 non-null float64 6 gps_acc_x_below_suspension 72019 non-null float64 7 gps_acc_y_below_suspension 72019 non-null float64 8 gps_acc_z_below_suspension 72019 non-null float64 9 gps_gyro_x_dashboard 72019 non-null float64 10 gps_gyro_y_dashboard 72019 non-null float64 11 gps_gyro_z_dashboard 72019 non-null float64 12 gps_gyro_x_above_suspension 72019 non-null float64 13 gps_gyro_y_above_suspension 72019 non-null float64 14 gps_gyro_z_above_suspension 72019 non-null float64 15 gps_gyro_x_below_suspension 72019 non-null float64 16 gps_gyro_y_below_suspension 72019 non-null float64 17 gps_gyro_z_below_suspension 72019 non-null float64 18 gps_mag_x_dashboard 72019 non-null float64 19 gps_mag_y_dashboard 72019 non-null float64 20 gps_mag_z_dashboard 72019 non-null float64 21 gps_mag_x_above_suspension 72019 non-null float64 22 gps_mag_y_above_suspension 72019 non-null float64 23 gps_mag_z_above_suspension 72019 non-null float64 24 gps_temp_dashboard 72019 non-null float64 25 gps_temp_above_suspension 72019 non-null float64 26 gps_temp_below_suspension 72019 non-null float64 27 gps_timestamp_gps 72019 non-null float64 28 gps_latitude 72019 non-null float64 29 gps_longitude 72019 non-null float64 30 gps_speed 72019 non-null float64 31 session 72019 non-null object 32 speed_drop 72019 non-null float64 33 proxy_incident 72019 non-null int64 dtypes: float64(32), int64(1), object(1) memory usage: 19.2+ MB None
| gps_acc_x_dashboard | gps_acc_y_dashboard | gps_acc_z_dashboard | gps_acc_x_above_suspension | gps_acc_y_above_suspension | gps_acc_z_above_suspension | gps_acc_x_below_suspension | gps_acc_y_below_suspension | gps_acc_z_below_suspension | gps_gyro_x_dashboard | ... | gps_temp_dashboard | gps_temp_above_suspension | gps_temp_below_suspension | gps_timestamp_gps | gps_latitude | gps_longitude | gps_speed | session | speed_drop | proxy_incident | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| timestamp | |||||||||||||||||||||
| 2019-12-24 20:19:56.540 | 0.314897 | 0.187227 | 9.863572 | 0.314750 | 0.166426 | 9.808869 | 0.529819 | 0.097111 | 9.930623 | 0.221062 | ... | 34.274628 | 34.035014 | 31.926408 | 1.577219e+09 | -27.717841 | -51.098865 | 0.009128 | PVS 1 | 0.0 | 0 |
| 2019-12-24 20:19:56.560 | 0.297539 | 0.187227 | 9.869558 | 0.313553 | 0.154455 | 9.855556 | 0.525031 | 0.100702 | 9.948579 | 0.045586 | ... | 34.358493 | 34.082936 | 31.734717 | 1.577219e+09 | -27.717841 | -51.098865 | 0.009128 | PVS 1 | 0.0 | 0 |
| 2019-12-24 20:19:56.580 | 0.308912 | 0.199198 | 9.842024 | 0.332706 | 0.159244 | 9.831614 | 0.533411 | 0.092323 | 9.887527 | 0.175285 | ... | 34.370474 | 33.939168 | 31.447180 | 1.577219e+09 | -27.717841 | -51.098865 | 0.009128 | PVS 1 | 0.0 | 0 |
| 2019-12-24 20:19:56.600 | 0.317292 | 0.157299 | 9.859981 | 0.297991 | 0.156849 | 9.824431 | 0.498695 | 0.404766 | 9.923440 | 0.205803 | ... | 34.514242 | 33.963129 | 31.447180 | 1.577219e+09 | -27.717841 | -51.098865 | 0.009128 | PVS 1 | 0.0 | 0 |
| 2019-12-24 20:19:56.620 | 0.295744 | 0.148919 | 9.885120 | 0.266866 | 0.156849 | 9.835205 | 0.496300 | 0.097111 | 9.856403 | -0.206184 | ... | 34.370474 | 34.082936 | 31.638871 | 1.577219e+09 | -27.717841 | -51.098865 | 0.009128 | PVS 1 | 0.0 | 0 |
5 rows × 34 columns
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| gps_speed | 72019.0 | 9.556739 | 7.746441 | 0.002526 | 4.508887 | 6.618945 | 16.647470 | 26.874480 |
| speed_drop | 72019.0 | 0.159041 | 0.285608 | 0.000000 | 0.000000 | 0.004228 | 0.225994 | 3.895296 |
| proxy_incident | 72019.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
Missing values profile (bar chart)¶
In [29]:
na_pct = df.isna().mean().sort_values(ascending=False) * 100
na_top = na_pct[na_pct > 0].head(25)
if not na_top.empty:
plt.figure(figsize=(10, 6))
na_top.sort_values().plot(kind="barh")
plt.title("Top columns with missing values (%)")
plt.xlabel("% missing")
plt.tight_layout(); plt.show()
else:
print("No missing values in this session.")
No missing values in this session.
Distributions¶
In [30]:
# Pick some numeric columns that actually exist
num_candidates = cols_exist(df, ["acc_ax","acc_ay","acc_az",
"gyro_gx","gyro_gy","gyro_gz",
"gps_speed","acc_mag","gyro_mag","speed_drop"])
for c in num_candidates[:6]:
plt.figure()
df[c].dropna().hist(bins=60)
plt.title(f"Distribution: {c}")
plt.xlabel(c); plt.ylabel("Frequency")
plt.show()
# Proxy label distribution
if "proxy_incident" in df.columns:
vc = df["proxy_incident"].value_counts()
print("Proxy incident counts:\n", vc)
vc.plot(kind="bar")
plt.title("Proxy incident distribution (per timestamp)")
plt.xticks(rotation=0); plt.show()
Proxy incident counts: proxy_incident 0 72019 Name: count, dtype: int64
Short time-series preview¶
In [31]:
fs = CFG["resample_hz"]
seg_samples = 60 * fs
seg = df.iloc[:seg_samples].copy()
has_acc = cols_exist(seg, ["acc_ax","acc_ay","acc_az"])
has_gyro = cols_exist(seg, ["gyro_gx","gyro_gy","gyro_gz"])
has_speed= cols_exist(seg, ["gps_speed"])
# Plot accelerometer
if has_acc:
seg[has_acc].plot()
plt.title("Accelerometer (first 60s)")
plt.show()
# Plot gyroscope
if has_gyro:
seg[has_gyro].plot()
plt.title("Gyroscope (first 60s)")
plt.show()
# Plot speed & proxy label markers
if has_speed:
ax = seg["gps_speed"].plot(label="gps_speed")
if "proxy_incident" in seg.columns and seg["proxy_incident"].sum() > 0:
where = seg["proxy_incident"] == 1
ax.scatter(seg.index[where], seg.loc[where,"gps_speed"],
marker="x", s=30, label="proxy_incident")
ax.set_title("GPS speed (first 60s) + proxy events")
ax.legend(); plt.show()
Correlation heatmap¶
In [32]:
num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
# Limit to a manageable number for plotting (top 25 by variance)
if len(num_cols) > 25:
var = df[num_cols].var().sort_values(ascending=False)
num_cols = var.head(25).index.tolist()
corr = df[num_cols].corr()
plt.figure(figsize=(8, 6))
plt.imshow(corr, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation (numeric subset)")
plt.colorbar(shrink=0.8)
plt.xticks(range(len(num_cols)), num_cols, rotation=90)
plt.yticks(range(len(num_cols)), num_cols)
plt.tight_layout(); plt.show()
Session-level label rate¶
In [33]:
win_sum = pd.read_csv(WORK / "windows_summary.csv")
win_sum = win_sum.sort_values("positive_rate", ascending=False)
ax = win_sum.plot(x="session", y="positive_rate", kind="bar", legend=False)
ax.set_title("Positive window rate by session")
ax.set_ylabel("Positive rate"); ax.set_xlabel("Session")
plt.xticks(rotation=45, ha="right"); plt.tight_layout(); plt.show()
print("Overall positive rate:",
round((win_sum["positive_rate"]*win_sum["windows"]).sum() / win_sum["windows"].sum(), 4))
Overall positive rate: 0.0017
Windows feature exploration (y=0 vs y=1)¶
In [34]:
# Read a sample of window files (to avoid huge memory)
win_files = sorted(glob.glob(str(WINDOWS / "*_windows.parquet")))
frames = []
for p in win_files[:6]: # adjust if you want more
f = pd.read_parquet(p)
if len(f) > 6000:
f = f.sample(6000, random_state=42)
frames.append(f)
W = pd.concat(frames, ignore_index=True)
print("Windows sample shape:", W.shape)
print(W["y"].value_counts())
# Choose informative features (keep it small)
cands = [c for c in W.columns if any(k in c for k in ["acc_mag","gyro_mag","gps_speed","speed_drop"]) and c.endswith(("_mean","_max","_std","_rms"))]
top = cands[:8] if len(cands) > 8 else cands
print("Using features:", top)
# Boxplots: y=0 vs y=1 for each chosen feature
import math
r = math.ceil(len(top)/2)
fig, axes = plt.subplots(r, 2, figsize=(12, 4*r))
axes = axes.flatten()
for i, c in enumerate(top):
try:
W.boxplot(column=c, by="y", ax=axes[i])
axes[i].set_title(c); axes[i].set_xlabel("y"); axes[i].set_ylabel(c)
except Exception as e:
axes[i].set_visible(False)
plt.suptitle("Feature distributions by class (windows)"); plt.tight_layout(); plt.show()
Windows sample shape: (4906, 169) y 0 4900 1 6 Name: count, dtype: int64 Using features: ['gps_speed_mean', 'gps_speed_std', 'gps_speed_max', 'gps_speed_rms', 'speed_drop_mean', 'speed_drop_std', 'speed_drop_max', 'speed_drop_rms']
Outliner scan (z-score count)¶
In [35]:
def z_outliers(x, z=4.0):
m, s = np.nanmean(x), np.nanstd(x)
if s == 0 or np.isnan(s): return 0
return int(np.sum(np.abs((x - m)/s) > z))
if num_cols:
outlier_counts = {c: z_outliers(df[c].values) for c in num_cols}
outlier_df = (pd.Series(outlier_counts)
.sort_values(ascending=False)
.head(15)
.rename("outliers (>4σ)"))
display(outlier_df)
else:
print("No numeric columns for outlier scan.")
gps_gyro_z_dashboard 1284 gps_gyro_z_above_suspension 1261 gps_acc_y_below_suspension 408 gps_acc_z_below_suspension 368 gps_gyro_z_below_suspension 354 gps_gyro_x_above_suspension 341 gps_gyro_x_below_suspension 294 gps_acc_x_below_suspension 279 gps_gyro_y_above_suspension 257 gps_gyro_x_dashboard 256 gps_gyro_y_dashboard 252 gps_gyro_y_below_suspension 227 gps_acc_y_dashboard 217 gps_acc_z_dashboard 209 gps_acc_z_above_suspension 178 Name: outliers (>4σ), dtype: int64
Save a compact EDA report¶
In [36]:
REPORTS = WORK / "eda_reports"
REPORTS.mkdir(exist_ok=True)
# 1) Column summary (cleaned session)
col_summary = pd.DataFrame({
"dtype": df.dtypes.astype(str),
"missing_pct": df.isna().mean()*100,
"nonzero_pct": (df.fillna(0) != 0).mean()*100,
})
col_summary.to_csv(REPORTS / "cleaned_column_summary.csv")
# 2) Windows class summary
win_class = W["y"].value_counts(normalize=True).rename_axis("y").rename("pct").reset_index()
win_class.to_csv(REPORTS / "windows_class_distribution.csv", index=False)
# 3) Per-session positive rate (already exists but replicate)
win_sum.to_csv(REPORTS / "per_session_positive_rate.csv", index=False)
print("Saved EDA CSVs to:", REPORTS)
Saved EDA CSVs to: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports
EDA-Map: Quick GPS scapper map¶
In [37]:
try:
import folium
gps_cols = cols_exist(df, ["gps_latitude","gps_longitude"]) or cols_exist(df, ["latitude","longitude"])
if gps_cols:
lat_col, lon_col = gps_cols[0], gps_cols[1]
gps = df[[lat_col, lon_col]].dropna().sample(min(3000, len(df)), random_state=42)
center = [gps[lat_col].mean(), gps[lon_col].mean()]
m = folium.Map(location=center, zoom_start=13)
for _, r in gps.iterrows():
folium.CircleMarker([r[lat_col], r[lon_col]], radius=1).add_to(m)
display(m)
else:
print("No GPS columns found for mapping.")
except Exception as e:
print("Map skipped:", e)
Make this Notebook Trusted to load map: File -> Trust Notebook
STEP 5: MODELING¶
Setup & imports¶
In [39]:
# Core
import pathlib, glob, json, os, gc, warnings
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")
# ML
from sklearn.model_selection import train_test_split, GroupKFold, StratifiedGroupKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support,
roc_auc_score, average_precision_score,
confusion_matrix, classification_report)
from sklearn.utils.class_weight import compute_class_weight
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
# Plotting
import matplotlib.pyplot as plt
# === CHANGE THIS TO YOUR PROJECT ROOT ===
BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
WORK = BASE_PATH / "work"
WINDOWS = WORK / "windows"
MODEL_DIR= WORK / "models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)
# Load config (for reproducibility notes)
CFG = json.loads((WORK/"config.json").read_text()) if (WORK/"config.json").exists() else {}
print("Models will be saved to:", MODEL_DIR)
Models will be saved to: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\models
Load & merge window datasets¶
In [40]:
win_files = sorted(glob.glob(str(WINDOWS / "*_windows.parquet")))
assert win_files, "No window files found. Run Step 3 windowing first."
frames = []
for p in win_files:
f = pd.read_parquet(p)
frames.append(f)
W = pd.concat(frames, ignore_index=True)
print("Windows shape:", W.shape)
print("Class balance:", W["y"].value_counts(normalize=True).rename({0:"normal",1:"incident"}))
print("Sessions:", W["session"].nunique())
display(W.head(3))
Windows shape: (7193, 169) Class balance: y normal 0.998332 incident 0.001668 Name: proportion, dtype: float64 Sessions: 9
| session | t_start | t_end | y | gps_acc_x_dashboard_mean | gps_acc_x_dashboard_std | gps_acc_x_dashboard_min | gps_acc_x_dashboard_max | gps_acc_x_dashboard_rms | gps_acc_y_dashboard_mean | ... | speed_drop_mean | speed_drop_std | speed_drop_min | speed_drop_max | speed_drop_rms | proxy_incident_mean | proxy_incident_std | proxy_incident_min | proxy_incident_max | proxy_incident_rms | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | PVS 1 | 2019-12-24 20:19:56.540 | 2019-12-24 20:19:59.520 | 0 | 0.306168 | 0.016296 | 0.270605 | 0.365176 | 0.306598 | 0.165220 | ... | 0.000099 | 0.000143 | 0.0 | 0.000303 | 0.000173 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | PVS 1 | 2019-12-24 20:19:58.040 | 2019-12-24 20:20:01.020 | 0 | 0.307486 | 0.026242 | 0.223918 | 0.374752 | 0.308596 | 0.163978 | ... | 0.000201 | 0.000913 | 0.0 | 0.007933 | 0.000932 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | PVS 1 | 2019-12-24 20:19:59.540 | 2019-12-24 20:20:02.520 | 0 | 0.306282 | 0.030071 | 0.195187 | 0.385526 | 0.307745 | 0.163780 | ... | 0.002592 | 0.003733 | 0.0 | 0.007933 | 0.004534 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 rows × 169 columns
Split Train /Validation/ Test by session¶
In [41]:
rng = np.random.RandomState(42)
# Unique sessions
sessions = W["session"].unique()
rng.shuffle(sessions)
# 70/15/15 split by session count
n = len(sessions)
n_train = int(0.70*n)
n_valid = int(0.15*n)
train_s = set(sessions[:n_train])
valid_s = set(sessions[n_train:n_train+n_valid])
test_s = set(sessions[n_train+n_valid:])
def mask(sset): return W["session"].isin(sset)
train_df = W[mask(train_s)].reset_index(drop=True)
valid_df = W[mask(valid_s)].reset_index(drop=True)
test_df = W[mask(test_s)].reset_index(drop=True)
print("Sessions -> train/valid/test:", len(train_s), len(valid_s), len(test_s))
print("Rows ->", len(train_df), len(valid_df), len(test_df))
# Build feature list (numeric only, drop obvious non-features)
drop_cols = {"session","t_start","t_end","y"}
feat_cols = [c for c in W.columns
if c not in drop_cols and pd.api.types.is_numeric_dtype(W[c])]
X_train, y_train, g_train = train_df[feat_cols], train_df["y"], train_df["session"]
X_valid, y_valid, g_valid = valid_df[feat_cols], valid_df["y"], valid_df["session"]
X_test, y_test, g_test = test_df [feat_cols], test_df ["y"], test_df ["session"]
print("Feature count:", len(feat_cols))
Sessions -> train/valid/test: 6 1 2 Rows -> 4565 891 1737 Feature count: 165
Class weights & evaluation helper¶
In [42]:
# Class weights from training set
classes = np.array([0,1])
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weight = {int(k): float(v) for k, v in zip(classes, cw)}
print("Class weights:", class_weight)
def evaluate(model, X, y, title=""):
proba_ok = hasattr(model, "predict_proba")
if proba_ok:
y_proba = model.predict_proba(X)[:,1]
else:
# decision_function fallback (SVM)
if hasattr(model, "decision_function"):
s = model.decision_function(X)
y_proba = (s - s.min()) / (s.max()-s.min() + 1e-9)
else:
y_proba = None
y_hat = model.predict(X)
acc = accuracy_score(y, y_hat)
p, r, f1, _ = precision_recall_fscore_support(y, y_hat, average="binary", zero_division=0)
ap = average_precision_score(y, y_proba) if y_proba is not None else np.nan
auc = roc_auc_score(y, y_proba) if y_proba is not None else np.nan
print(f"\n== {title} ==")
print(f"Accuracy: {acc:.4f} | Precision: {p:.4f} | Recall: {r:.4f} | F1: {f1:.4f}")
print(f"ROC-AUC: {auc:.4f} | PR-AUC (AP): {ap:.4f}")
# Confusion matrix
cm = confusion_matrix(y, y_hat, labels=[0,1])
fig, ax = plt.subplots(figsize=(3.5,3))
im = ax.imshow(cm, cmap="Blues")
ax.set_title(f"Confusion Matrix: {title}")
ax.set_xticks([0,1]); ax.set_yticks([0,1])
ax.set_xticklabels(["0","1"]); ax.set_yticklabels(["0","1"])
for (i,j), v in np.ndenumerate(cm):
ax.text(j, i, str(v), ha="center", va="center", color="black")
ax.set_xlabel("Predicted"); ax.set_ylabel("True")
plt.colorbar(im, fraction=0.046, pad=0.04); plt.tight_layout(); plt.show()
return {"acc":acc, "prec":p, "rec":r, "f1":f1, "auc":auc, "ap":ap}
Class weights: {0: 0.5007678806494076, 1: 326.07142857142856}
Preprocessing pipeline¶
In [43]:
base_steps = [
("imputer", SimpleImputer(strategy="median")),
("scaler", StandardScaler(with_mean=True, with_std=True))
]
Train baseline models¶
In [44]:
results = {}
# 1) Logistic Regression (liblinear for small/imbalanced)
lr = Pipeline(base_steps + [
("clf", LogisticRegression(max_iter=500, class_weight=class_weight, solver="liblinear"))
])
lr.fit(X_train, y_train)
results["LogReg_valid"] = evaluate(lr, X_valid, y_valid, "LogReg (valid)")
# 2) Random Forest
rf = Pipeline(base_steps + [
("clf", RandomForestClassifier(n_estimators=300, max_depth=None,
class_weight=class_weight, random_state=42, n_jobs=-1))
])
rf.fit(X_train, y_train)
results["RF_valid"] = evaluate(rf, X_valid, y_valid, "RandomForest (valid)")
# 3) Gradient Boosting
gb = Pipeline(base_steps + [
("clf", GradientBoostingClassifier(random_state=42))
])
gb.fit(X_train, y_train)
results["GB_valid"] = evaluate(gb, X_valid, y_valid, "GradientBoosting (valid)")
# 4) SVM (RBF)
svm = Pipeline(base_steps + [
("clf", SVC(kernel="rbf", C=1.0, gamma="scale", class_weight=class_weight, probability=True, random_state=42))
])
svm.fit(X_train, y_train)
results["SVM_valid"] = evaluate(svm, X_valid, y_valid, "SVM RBF (valid)")
pd.DataFrame(results).T
== LogReg (valid) == Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 ROC-AUC: 1.0000 | PR-AUC (AP): 1.0000
== RandomForest (valid) == Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 ROC-AUC: 1.0000 | PR-AUC (AP): 1.0000
== GradientBoosting (valid) == Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 ROC-AUC: 1.0000 | PR-AUC (AP): 1.0000
== SVM RBF (valid) == Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 ROC-AUC: 1.0000 | PR-AUC (AP): 1.0000
Out[44]:
| acc | prec | rec | f1 | auc | ap | |
|---|---|---|---|---|---|---|
| LogReg_valid | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| RF_valid | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| GB_valid | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| SVM_valid | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
Light hyper-parameter tuning¶
In [45]:
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
def gridsearch(pipe, grid, name):
gs = GridSearchCV(pipe, grid, scoring="average_precision", cv=cv, n_jobs=-1, verbose=0)
gs.fit(X_train, y_train, groups=g_train)
print(f"\n[{name}] best AP (cv): {gs.best_score_:.4f}")
print("Best params:", gs.best_params_)
# Evaluate on validation
best = gs.best_estimator_
evaluate(best, X_valid, y_valid, f"{name} (VALID)")
return best
# Logistic Regression grid
lr_grid = {
"clf__C": [0.1, 1.0, 3.0],
"clf__penalty": ["l1","l2"],
"clf__solver": ["liblinear"]
}
lr_best = gridsearch(
Pipeline(base_steps + [("clf", LogisticRegression(max_iter=800, class_weight=class_weight))]),
lr_grid, "LogReg"
)
# Random Forest grid
rf_grid = {
"clf__n_estimators": [300, 600],
"clf__max_depth": [None, 12, 18],
"clf__min_samples_leaf": [1, 3]
}
rf_best = gridsearch(
Pipeline(base_steps + [("clf", RandomForestClassifier(class_weight=class_weight, random_state=42, n_jobs=-1))]),
rf_grid, "RandomForest"
)
# Gradient Boosting grid
gb_grid = {
"clf__n_estimators": [150, 300],
"clf__learning_rate": [0.05, 0.1],
"clf__max_depth": [2, 3]
}
gb_best = gridsearch(
Pipeline(base_steps + [("clf", GradientBoostingClassifier(random_state=42))]),
gb_grid, "GradientBoosting"
)
# SVM grid (RBF)
svm_grid = {
"clf__C": [0.5, 1.0, 2.0],
"clf__gamma": ["scale", 0.1, 0.01]
}
svm_best = gridsearch(
Pipeline(base_steps + [("clf", SVC(kernel="rbf", class_weight=class_weight, probability=True, random_state=42))]),
svm_grid, "SVM RBF"
)
[LogReg] best AP (cv): 0.6000
Best params: {'clf__C': 0.1, 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}
== LogReg (VALID) ==
Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000
ROC-AUC: 1.0000 | PR-AUC (AP): 1.0000
[RandomForest] best AP (cv): 0.6000
Best params: {'clf__max_depth': None, 'clf__min_samples_leaf': 1, 'clf__n_estimators': 300}
== RandomForest (VALID) ==
Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000
ROC-AUC: 1.0000 | PR-AUC (AP): 1.0000
[GradientBoosting] best AP (cv): 0.6000
Best params: {'clf__learning_rate': 0.05, 'clf__max_depth': 2, 'clf__n_estimators': 150}
== GradientBoosting (VALID) ==
Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000
ROC-AUC: 1.0000 | PR-AUC (AP): 1.0000
[SVM RBF] best AP (cv): 0.6000
Best params: {'clf__C': 0.5, 'clf__gamma': 'scale'}
== SVM RBF (VALID) ==
Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000
ROC-AUC: 1.0000 | PR-AUC (AP): 1.0000
Train + Valid¶
In [46]:
# Manually pick based on your VALID metrics (edit this line)
best_model = rf_best # e.g., rf_best / gb_best / lr_best / svm_best
# Refit on Train+Valid
X_trv = pd.concat([X_train, X_valid], axis=0)
y_trv = pd.concat([y_train, y_valid], axis=0)
best_model.fit(X_trv, y_trv)
# Final test evaluation
test_metrics = evaluate(best_model, X_test, y_test, "FINAL (TEST)")
test_metrics
== FINAL (TEST) == Accuracy: 0.9983 | Precision: 0.0000 | Recall: 0.0000 | F1: 0.0000 ROC-AUC: 1.0000 | PR-AUC (AP): 1.0000
Out[46]:
{'acc': 0.998272884283247,
'prec': 0.0,
'rec': 0.0,
'f1': 0.0,
'auc': np.float64(1.0),
'ap': np.float64(1.0)}
Save model & metadata¶
In [47]:
import joblib, time
stamp = time.strftime("%Y%m%d_%H%M%S")
model_path = MODEL_DIR / f"best_model_{stamp}.joblib"
joblib.dump(best_model, model_path)
meta = {
"created": stamp,
"features": feat_cols,
"config": CFG,
"train_sessions": sorted(list(set(g_train))),
"valid_sessions": sorted(list(set(g_valid))),
"test_sessions": sorted(list(set(g_test))),
"test_metrics": test_metrics,
"class_weight": class_weight
}
json.dump(meta, open(MODEL_DIR / f"best_model_{stamp}.json", "w"), indent=2)
print("Saved:", model_path)
Saved: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\models\best_model_20251110_180118.joblib
To Show how to load and predict on a new batch of windows¶
In [48]:
# Load
best_model = joblib.load(model_path)
# Example: predict on test set (or any new window table with same features)
proba = best_model.predict_proba(X_test)[:,1]
yhat = (proba >= 0.5).astype(int)
print("Sample predictions:", yhat[:10])
print("Sample probabilities:", np.round(proba[:10], 3))
Sample predictions: [0 0 0 0 0 0 0 0 0 0] Sample probabilities: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
STEP 6: EVALUATION¶
Reloding the model and testing the split¶
In [49]:
import pathlib, json, joblib, pandas as pd
from sklearn.metrics import confusion_matrix
# === CHANGE to your project root if needed ===
BASE_PATH = pathlib.Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
WORK = BASE_PATH / "work"
MODEL_DIR = WORK / "models"
WINDOWS = WORK / "windows"
# Load the latest saved model & metadata
model_path = sorted(MODEL_DIR.glob("best_model_*.joblib"))[-1]
meta_path = model_path.with_suffix(".json")
best_model = joblib.load(model_path)
meta = json.load(open(meta_path))
feat_cols = meta["features"]
test_sessions = meta["test_sessions"]
# Rebuild test set from window files
import glob
frames = []
for p in sorted(glob.glob(str(WINDOWS/"*_windows.parquet"))):
df = pd.read_parquet(p)
if df["session"].iloc[0] in test_sessions:
frames.append(df)
test_df = pd.concat(frames, ignore_index=True)
X_test = test_df[feat_cols]
y_test = test_df["y"]
g_test = test_df["session"]
print("Loaded model:", model_path.name, "| Test rows:", len(test_df))
Loaded model: best_model_20251110_180118.joblib | Test rows: 1737
For computing metrics + confusion matrix¶
In [50]:
import numpy as np, matplotlib.pyplot as plt
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support,
roc_auc_score, average_precision_score,
ConfusionMatrixDisplay)
def evaluate(model, X, y, title=""):
proba = (model.predict_proba(X)[:,1]
if hasattr(model, "predict_proba")
else model.decision_function(X))
# Ensure [0,1]
if proba.min() < 0 or proba.max() > 1:
proba = (proba - proba.min())/(proba.max()-proba.min() + 1e-9)
yhat = (proba >= 0.5).astype(int)
acc = accuracy_score(y, yhat)
p, r, f1, _ = precision_recall_fscore_support(y, yhat, average="binary", zero_division=0)
auc = roc_auc_score(y, proba)
ap = average_precision_score(y, proba)
print(f"\n== {title} ==")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {p:.4f} | Recall: {r:.4f} | F1: {f1:.4f}")
print(f"ROC-AUC : {auc:.4f} | PR-AUC (AP): {ap:.4f}")
ConfusionMatrixDisplay.from_predictions(y, yhat, labels=[0,1], cmap="Blues")
plt.title(f"Confusion Matrix – {title} (thr=0.50)")
plt.tight_layout(); plt.show()
return dict(acc=acc, prec=p, rec=r, f1=f1, auc=auc, ap=ap, proba=proba, yhat=yhat)
Global metrics on Valid and Test¶
In [51]:
metrics_test = evaluate(best_model, X_test, y_test, "TEST")
== TEST == Accuracy : 0.9983 Precision: 0.0000 | Recall: 0.0000 | F1: 0.0000 ROC-AUC : 1.0000 | PR-AUC (AP): 1.0000
curves (ROC & Precision-Recall) on TEST¶
In [52]:
from sklearn.metrics import roc_curve, precision_recall_curve, auc
proba = metrics_test["proba"]
fpr, tpr, _ = roc_curve(y_test, proba)
prec, rec, _ = precision_recall_curve(y_test, proba)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(5,4))
plt.plot(fpr, tpr, lw=2)
plt.plot([0,1],[0,1],'--',lw=1, color='grey')
plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
plt.title(f"ROC Curve (AUC={roc_auc:.3f}) – TEST"); plt.grid(True); plt.tight_layout(); plt.show()
plt.figure(figsize=(5,4))
plt.plot(rec, prec, lw=2)
plt.xlabel("Recall"); plt.ylabel("Precision")
plt.title(f"Precision–Recall (AP={metrics_test['ap']:.3f}) – TEST")
plt.grid(True); plt.tight_layout(); plt.show()
Threshold tuning (optimize F1 or Recall>=X)¶
In [53]:
def find_best_threshold(y, proba, min_recall=None):
prec, rec, thr = precision_recall_curve(y, proba)
# Skip last duplicate threshold
thr = np.r_[thr, 1.0]
f1 = 2*prec*rec/(prec+rec+1e-9)
if min_recall is not None:
ok = rec >= min_recall
if ok.any():
i = np.argmax(f1 * ok) # best F1 among those meeting recall
else:
i = np.argmax(f1) # fallback
else:
i = np.argmax(f1)
return float(thr[i]), float(prec[i]), float(rec[i]), float(f1[i])
best_thr, p_at, r_at, f1_at = find_best_threshold(y_test, proba, min_recall=0.85) # change or set to None
print(f"Chosen threshold: {best_thr:.3f} | Prec={p_at:.3f} Recall={r_at:.3f} F1={f1_at:.3f}")
yhat_tuned = (proba >= best_thr).astype(int)
ConfusionMatrixDisplay.from_predictions(y_test, yhat_tuned, labels=[0,1], cmap="Blues")
plt.title(f"Confusion Matrix – TEST (thr={best_thr:.2f})"); plt.tight_layout(); plt.show()
Chosen threshold: 0.143 | Prec=1.000 Recall=1.000 F1=1.000
Per-session metrics (reliability by trip)¶
In [54]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
df_eval = pd.DataFrame({"session": g_test, "y": y_test, "proba": proba})
df_eval["yhat"] = (df_eval["proba"] >= best_thr).astype(int)
rows = []
for s, grp in df_eval.groupby("session"):
p, r, f1, _ = precision_recall_fscore_support(grp["y"], grp["yhat"], average="binary", zero_division=0)
acc = accuracy_score(grp["y"], grp["yhat"])
rows.append({"session": s, "n": len(grp), "acc":acc, "prec":p, "rec":r, "f1":f1})
per_session = pd.DataFrame(rows).sort_values("f1", ascending=False)
display(per_session.head(10))
ax = per_session.plot(x="session", y=["f1","rec","prec"], kind="bar", figsize=(10,4))
ax.set_title("Per-session metrics (TEST)"); ax.set_ylabel("Score"); plt.xticks(rotation=45, ha="right")
plt.tight_layout(); plt.show()
| session | n | acc | prec | rec | f1 | |
|---|---|---|---|---|---|---|
| 1 | PVS 7 | 855 | 1.0 | 1.0 | 1.0 | 1.0 |
| 0 | PVS 4 | 882 | 1.0 | 0.0 | 0.0 | 0.0 |
Calibration check (reliability curve & brier score)¶
In [55]:
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss
prob_true, prob_pred = calibration_curve(y_test, proba, n_bins=10, strategy="quantile")
plt.figure(figsize=(5,4))
plt.plot(prob_pred, prob_true, marker='o')
plt.plot([0,1],[0,1],'--', color='grey')
plt.xlabel("Predicted probability"); plt.ylabel("Observed frequency")
plt.title("Reliability curve – TEST"); plt.grid(True); plt.tight_layout(); plt.show()
print("Brier score (lower is better):", round(brier_score_loss(y_test, proba), 4))
Brier score (lower is better): 0.0007
Feature Importance¶
In [56]:
import numpy as np
import matplotlib.pyplot as plt
def plot_importance(model, feature_names, top_k=20, title="Feature importance"):
if hasattr(model.named_steps["clf"], "feature_importances_"):
imp = model.named_steps["clf"].feature_importances_
names = np.array(feature_names)
idx = np.argsort(imp)[::-1][:top_k]
plt.figure(figsize=(8,6))
plt.barh(range(len(idx)), imp[idx][::-1])
plt.yticks(range(len(idx)), names[idx][::-1], fontsize=9)
plt.title(title); plt.tight_layout(); plt.show()
return pd.DataFrame({"feature": names[idx], "importance": imp[idx]})
else:
print("Tree-based importances not available; consider permutation importance below.")
imp_df = plot_importance(best_model, feat_cols, top_k=20, title="Top features (tree model)")
In [58]:
from sklearn.inspection import permutation_importance
pi = permutation_importance(best_model, X_test, y_test, n_repeats=5, random_state=42, scoring="average_precision")
idx = np.argsort(pi.importances_mean)[::-1][:20]
plt.figure(figsize=(8,6))
plt.barh(range(len(idx)), pi.importances_mean[idx][::-1])
plt.yticks(range(len(idx)), np.array(feat_cols)[idx][::-1], fontsize=9)
plt.title("Permutation importance – TEST"); plt.tight_layout(); plt.show()
Error Analysis - list top FNs & FPs¶
In [59]:
errs = pd.DataFrame({"session": g_test, "y": y_test, "proba": proba})
errs["yhat_thr"] = (errs["proba"] >= best_thr).astype(int)
fn = errs[(errs["y"]==1) & (errs["yhat_thr"]==0)].sort_values("proba") # missed incidents
fp = errs[(errs["y"]==0) & (errs["yhat_thr"]==1)].sort_values("proba", ascending=False) # false alarms
print("Top 10 False Negatives (worst misses):")
display(fn.head(10))
print("Top 10 False Positives (high-confidence false alarms):")
display(fp.head(10))
# Save for review
REPORT_DIR = WORK / "eval_reports"; REPORT_DIR.mkdir(exist_ok=True)
fn.head(200).to_csv(REPORT_DIR/"false_negatives.csv", index=False)
fp.head(200).to_csv(REPORT_DIR/"false_positives.csv", index=False)
print("Saved:", REPORT_DIR)
Top 10 False Negatives (worst misses):
| session | y | proba | yhat_thr |
|---|
Top 10 False Positives (high-confidence false alarms):
| session | y | proba | yhat_thr |
|---|
Saved: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eval_reports
Export a compact evaluation report (CSV + JSON)¶
In [60]:
import json, time, pandas as pd, numpy as np
stamp = time.strftime("%Y%m%d_%H%M%S")
summary = {
"timestamp": stamp,
"model_file": model_path.name if "model_path" in globals() else "in-memory",
"thr": best_thr,
"test_metrics_default_thr": {k: float(v) for k,v in metrics_test.items() if k not in ("proba","yhat")},
"test_metrics_tuned_thr": {
"acc": float((errs["y"]==(errs["proba"]>=best_thr).astype(int)).mean()),
"prec": float((fp.shape[0]==0 and 0.0) or (1 - fp.shape[0]/max(1, (errs["yhat_thr"]==1).sum()))), # quick approx; formal calc below
}
}
# Formal precision/recall/F1 at tuned threshold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
yhat_tuned = (proba >= best_thr).astype(int)
p, r, f1, _ = precision_recall_fscore_support(y_test, yhat_tuned, average="binary", zero_division=0)
summary["test_metrics_tuned_thr"].update({
"precision": float(p), "recall": float(r), "f1": float(f1)
})
# Save
report_dir = WORK/"eval_reports"; report_dir.mkdir(exist_ok=True)
json.dump(summary, open(report_dir/f"evaluation_{stamp}.json","w"), indent=2)
# Per-session table too
per_session.to_csv(report_dir/f"per_session_{stamp}.csv", index=False)
print("Saved reports to:", report_dir)
Saved reports to: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eval_reports
Testing Purpose¶
Assignment 2 Testing¶
In [1]:
# Pick the numeric columns you want to report in Assignment 2
num_cols = ['gps_speed', 'speed_drop'] # add others you used, e.g., 'acc_x', 'acc_y'...
# Compute mean, median, mode, and midrange
summary = {}
for col in num_cols:
s = df[col].dropna()
mean_ = s.mean()
median_ = s.median()
# mode() can return multiple values; take the first if any
mode_ = s.mode().iloc[0] if not s.mode().empty else float('nan')
midrange_ = (s.max() + s.min()) / 2.0
summary[col] = {
'count': s.shape[0],
'mean': mean_,
'median': median_,
'mode': mode_,
'min': s.min(),
'max': s.max(),
'midrange': midrange_
}
import pandas as pd
ct_df = pd.DataFrame(summary).T
ct_df
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[1], line 7 5 summary = {} 6 for col in num_cols: ----> 7 s = df[col].dropna() 8 mean_ = s.mean() 9 median_ = s.median() NameError: name 'df' is not defined
In [2]:
midrange_only = {col: (df[col].max() + df[col].min())/2 for col in num_cols}
pd.Series(midrange_only, name='midrange')
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[2], line 1 ----> 1 midrange_only = {col: (df[col].max() + df[col].min())/2 for col in num_cols} 2 pd.Series(midrange_only, name='midrange') NameError: name 'df' is not defined
In [3]:
import pandas as pd
# replace with your actual path and file name
df = pd.read_parquet(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\clean_resampled\PVS 1.parquet")
print("Data loaded successfully:", df.shape)
df.head()
Data loaded successfully: (72019, 34)
Out[3]:
| gps_acc_x_dashboard | gps_acc_y_dashboard | gps_acc_z_dashboard | gps_acc_x_above_suspension | gps_acc_y_above_suspension | gps_acc_z_above_suspension | gps_acc_x_below_suspension | gps_acc_y_below_suspension | gps_acc_z_below_suspension | gps_gyro_x_dashboard | ... | gps_temp_dashboard | gps_temp_above_suspension | gps_temp_below_suspension | gps_timestamp_gps | gps_latitude | gps_longitude | gps_speed | session | speed_drop | proxy_incident | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| timestamp | |||||||||||||||||||||
| 2019-12-24 20:19:56.540 | 0.314897 | 0.187227 | 9.863572 | 0.314750 | 0.166426 | 9.808869 | 0.529819 | 0.097111 | 9.930623 | 0.221062 | ... | 34.274628 | 34.035014 | 31.926408 | 1.577219e+09 | -27.717841 | -51.098865 | 0.009128 | PVS 1 | 0.0 | 0 |
| 2019-12-24 20:19:56.560 | 0.297539 | 0.187227 | 9.869558 | 0.313553 | 0.154455 | 9.855556 | 0.525031 | 0.100702 | 9.948579 | 0.045586 | ... | 34.358493 | 34.082936 | 31.734717 | 1.577219e+09 | -27.717841 | -51.098865 | 0.009128 | PVS 1 | 0.0 | 0 |
| 2019-12-24 20:19:56.580 | 0.308912 | 0.199198 | 9.842024 | 0.332706 | 0.159244 | 9.831614 | 0.533411 | 0.092323 | 9.887527 | 0.175285 | ... | 34.370474 | 33.939168 | 31.447180 | 1.577219e+09 | -27.717841 | -51.098865 | 0.009128 | PVS 1 | 0.0 | 0 |
| 2019-12-24 20:19:56.600 | 0.317292 | 0.157299 | 9.859981 | 0.297991 | 0.156849 | 9.824431 | 0.498695 | 0.404766 | 9.923440 | 0.205803 | ... | 34.514242 | 33.963129 | 31.447180 | 1.577219e+09 | -27.717841 | -51.098865 | 0.009128 | PVS 1 | 0.0 | 0 |
| 2019-12-24 20:19:56.620 | 0.295744 | 0.148919 | 9.885120 | 0.266866 | 0.156849 | 9.835205 | 0.496300 | 0.097111 | 9.856403 | -0.206184 | ... | 34.370474 | 34.082936 | 31.638871 | 1.577219e+09 | -27.717841 | -51.098865 | 0.009128 | PVS 1 | 0.0 | 0 |
5 rows × 34 columns
In [4]:
df.columns
Out[4]:
Index(['gps_acc_x_dashboard', 'gps_acc_y_dashboard', 'gps_acc_z_dashboard',
'gps_acc_x_above_suspension', 'gps_acc_y_above_suspension',
'gps_acc_z_above_suspension', 'gps_acc_x_below_suspension',
'gps_acc_y_below_suspension', 'gps_acc_z_below_suspension',
'gps_gyro_x_dashboard', 'gps_gyro_y_dashboard', 'gps_gyro_z_dashboard',
'gps_gyro_x_above_suspension', 'gps_gyro_y_above_suspension',
'gps_gyro_z_above_suspension', 'gps_gyro_x_below_suspension',
'gps_gyro_y_below_suspension', 'gps_gyro_z_below_suspension',
'gps_mag_x_dashboard', 'gps_mag_y_dashboard', 'gps_mag_z_dashboard',
'gps_mag_x_above_suspension', 'gps_mag_y_above_suspension',
'gps_mag_z_above_suspension', 'gps_temp_dashboard',
'gps_temp_above_suspension', 'gps_temp_below_suspension',
'gps_timestamp_gps', 'gps_latitude', 'gps_longitude', 'gps_speed',
'session', 'speed_drop', 'proxy_incident'],
dtype='object')
In [5]:
num_cols = ['gps_speed', 'acc_mag', 'gyro_mag', 'speed_drop']
summary = {}
for col in num_cols:
s = df[col].dropna()
mean_ = s.mean()
median_ = s.median()
mode_ = s.mode().iloc[0] if not s.mode().empty else float('nan')
midrange_ = (s.max() + s.min()) / 2.0
summary[col] = {
'count': s.shape[0],
'mean': mean_,
'median': median_,
'mode': mode_,
'min': s.min(),
'max': s.max(),
'midrange': midrange_
}
import pandas as pd
ct_df = pd.DataFrame(summary).T
print(ct_df)
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\indexes\base.py:3805, in Index.get_loc(self, key) 3804 try: -> 3805 return self._engine.get_loc(casted_key) 3806 except KeyError as err: File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc() File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc() File pandas\\_libs\\hashtable_class_helper.pxi:7081, in pandas._libs.hashtable.PyObjectHashTable.get_item() File pandas\\_libs\\hashtable_class_helper.pxi:7089, in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'acc_mag' The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) Cell In[5], line 5 3 summary = {} 4 for col in num_cols: ----> 5 s = df[col].dropna() 6 mean_ = s.mean() 7 median_ = s.median() File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\frame.py:4102, in DataFrame.__getitem__(self, key) 4100 if self.columns.nlevels > 1: 4101 return self._getitem_multilevel(key) -> 4102 indexer = self.columns.get_loc(key) 4103 if is_integer(indexer): 4104 indexer = [indexer] File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\indexes\base.py:3812, in Index.get_loc(self, key) 3807 if isinstance(casted_key, slice) or ( 3808 isinstance(casted_key, abc.Iterable) 3809 and any(isinstance(x, slice) for x in casted_key) 3810 ): 3811 raise InvalidIndexError(key) -> 3812 raise KeyError(key) from err 3813 except TypeError: 3814 # If we have a listlike key, _check_indexing_error will raise 3815 # InvalidIndexError. Otherwise we fall through and re-raise 3816 # the TypeError. 3817 self._check_indexing_error(key) KeyError: 'acc_mag'
In [6]:
['gps_acc_x_dashboard', 'gps_acc_y_dashboard', 'gps_acc_z_dashboard',
'gps_acc_x_above_suspension', 'gps_acc_y_above_suspension', 'gps_acc_z_above_suspension',
'gps_acc_x_below_suspension', 'gps_acc_y_below_suspension', 'gps_acc_z_below_suspension',
'gps_gyro_x_dashboard', 'gps_gyro_y_dashboard', 'gps_gyro_z_dashboard',
...
'gps_speed', 'session', 'speed_drop', 'proxy_incident']
Cell In[6], line 5 ... ^ SyntaxError: invalid syntax. Perhaps you forgot a comma?
In [7]:
import numpy as np
# Acceleration magnitude (using dashboard sensor)
df["acc_mag"] = np.sqrt(
df["gps_acc_x_dashboard"]**2 +
df["gps_acc_y_dashboard"]**2 +
df["gps_acc_z_dashboard"]**2
)
# Gyroscope magnitude (using dashboard sensor)
df["gyro_mag"] = np.sqrt(
df["gps_gyro_x_dashboard"]**2 +
df["gps_gyro_y_dashboard"]**2 +
df["gps_gyro_z_dashboard"]**2
)
print("Added new columns:", [c for c in df.columns if "mag" in c])
Added new columns: ['gps_mag_x_dashboard', 'gps_mag_y_dashboard', 'gps_mag_z_dashboard', 'gps_mag_x_above_suspension', 'gps_mag_y_above_suspension', 'gps_mag_z_above_suspension', 'acc_mag', 'gyro_mag']
Central Tendancy¶
In [8]:
num_cols = ['gps_speed', 'acc_mag', 'gyro_mag', 'speed_drop']
summary = {}
for col in num_cols:
s = df[col].dropna()
mean_ = s.mean()
median_ = s.median()
mode_ = s.mode().iloc[0] if not s.mode().empty else float('nan')
midrange_ = (s.max() + s.min()) / 2.0
summary[col] = {
'count': s.shape[0],
'mean': mean_,
'median': median_,
'mode': mode_,
'min': s.min(),
'max': s.max(),
'midrange': midrange_
}
import pandas as pd
ct_df = pd.DataFrame(summary).T
ct_df
Out[8]:
| count | mean | median | mode | min | max | midrange | |
|---|---|---|---|---|---|---|---|
| gps_speed | 72019.0 | 9.556739 | 6.618945 | 0.005715 | 0.002526 | 26.874480 | 13.438503 |
| acc_mag | 72019.0 | 9.948794 | 9.859017 | 9.792895 | 0.409363 | 27.451342 | 13.930352 |
| gyro_mag | 72019.0 | 6.070509 | 4.775196 | 4.116235 | 0.009907 | 54.310432 | 27.160169 |
| speed_drop | 72019.0 | 0.159041 | 0.004228 | 0.000000 | 0.000000 | 3.895296 | 1.947648 |
In [9]:
import matplotlib.pyplot as plt
for col in num_cols:
s = df[col].dropna()
plt.figure(figsize=(8,4))
plt.hist(s, bins=50, alpha=0.7, edgecolor='black')
plt.axvline(s.mean(), color='red', linestyle='--', linewidth=2, label=f"Mean = {s.mean():.2f}")
plt.axvline(s.median(), color='green', linestyle=':', linewidth=2, label=f"Median = {s.median():.2f}")
plt.title(f"Histogram of {col}")
plt.xlabel(col)
plt.ylabel("Frequency")
plt.legend()
plt.show()
In [10]:
for col in num_cols:
df[col].plot(kind='kde', linewidth=2, figsize=(7,3), title=f"KDE Plot of {col}")
plt.xlabel(col)
plt.show()
In [11]:
df[num_cols].plot.box(figsize=(7,4))
plt.title("Boxplot of Key Sensor Features")
plt.ylabel("Value")
plt.show()
In [3]:
ct_df.to_csv("central_tendency_summary.csv")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[3], line 1 ----> 1 ct_df.to_csv("central_tendency_summary.csv") NameError: name 'ct_df' is not defined
In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load cleaned dataset (example session)
df = pd.read_parquet(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\clean_resampled\PVS 1.parquet")
# Example: histogram and correlation heatmap
plt.figure(figsize=(8,4))
plt.hist(df['gps_speed'], bins=50, color='skyblue', edgecolor='black')
plt.title("Histogram of Vehicle Speed")
plt.xlabel("Speed (km/h)")
plt.ylabel("Frequency")
plt.show()
plt.figure(figsize=(8,6))
sns.heatmap(df.corr(), cmap='coolwarm', center=0)
plt.title("Correlation Heatmap of Sensor Features")
plt.tight_layout()
plt.show()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[2], line 17 14 plt.show() 16 plt.figure(figsize=(8,6)) ---> 17 sns.heatmap(df.corr(), cmap='coolwarm', center=0) 18 plt.title("Correlation Heatmap of Sensor Features") 19 plt.tight_layout() File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\frame.py:11049, in DataFrame.corr(self, method, min_periods, numeric_only) 11047 cols = data.columns 11048 idx = cols.copy() > 11049 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) 11051 if method == "pearson": 11052 correl = libalgos.nancorr(mat, minp=min_periods) File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\frame.py:1993, in DataFrame.to_numpy(self, dtype, copy, na_value) 1991 if dtype is not None: 1992 dtype = np.dtype(dtype) -> 1993 result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value) 1994 if result.dtype is not dtype: 1995 result = np.asarray(result, dtype=dtype) File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\internals\managers.py:1694, in BlockManager.as_array(self, dtype, copy, na_value) 1692 arr.flags.writeable = False 1693 else: -> 1694 arr = self._interleave(dtype=dtype, na_value=na_value) 1695 # The underlying data was copied within _interleave, so no need 1696 # to further copy if copy=True or setting na_value 1698 if na_value is lib.no_default: File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\internals\managers.py:1753, in BlockManager._interleave(self, dtype, na_value) 1751 else: 1752 arr = blk.get_values(dtype) -> 1753 result[rl.indexer] = arr 1754 itemmask[rl.indexer] = 1 1756 if not itemmask.all(): ValueError: could not convert string to float: 'PVS 1'
<Figure size 800x600 with 0 Axes>
In [2]:
df = pd.read_parquet(DATA)
print("Shape:", df.shape)
display(df.head(5))
# (Optional) show columns
pd.Series(df.columns, name="columns").to_frame().head(25)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[2], line 1 ----> 1 df = pd.read_parquet(DATA) 2 print("Shape:", df.shape) 3 display(df.head(5)) NameError: name 'pd' is not defined
In [3]:
# Core libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# I/O paths (edit to your actual file)
DATA = r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\clean_resampled\PVS 1.parquet"
# ART = r"C:\Users\<you>\Desktop\Accident_Detection_Project_Dataset\report_artifacts" # where images/csvs will be saved
In [4]:
df = pd.read_parquet(DATA)
print("Shape:", df.shape)
display(df.head(5))
# (Optional) show columns
pd.Series(df.columns, name="columns").to_frame().head(25)
Shape: (72019, 34)
| gps_acc_x_dashboard | gps_acc_y_dashboard | gps_acc_z_dashboard | gps_acc_x_above_suspension | gps_acc_y_above_suspension | gps_acc_z_above_suspension | gps_acc_x_below_suspension | gps_acc_y_below_suspension | gps_acc_z_below_suspension | gps_gyro_x_dashboard | ... | gps_temp_dashboard | gps_temp_above_suspension | gps_temp_below_suspension | gps_timestamp_gps | gps_latitude | gps_longitude | gps_speed | session | speed_drop | proxy_incident | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| timestamp | |||||||||||||||||||||
| 2019-12-24 20:19:56.540 | 0.314897 | 0.187227 | 9.863572 | 0.314750 | 0.166426 | 9.808869 | 0.529819 | 0.097111 | 9.930623 | 0.221062 | ... | 34.274628 | 34.035014 | 31.926408 | 1.577219e+09 | -27.717841 | -51.098865 | 0.009128 | PVS 1 | 0.0 | 0 |
| 2019-12-24 20:19:56.560 | 0.297539 | 0.187227 | 9.869558 | 0.313553 | 0.154455 | 9.855556 | 0.525031 | 0.100702 | 9.948579 | 0.045586 | ... | 34.358493 | 34.082936 | 31.734717 | 1.577219e+09 | -27.717841 | -51.098865 | 0.009128 | PVS 1 | 0.0 | 0 |
| 2019-12-24 20:19:56.580 | 0.308912 | 0.199198 | 9.842024 | 0.332706 | 0.159244 | 9.831614 | 0.533411 | 0.092323 | 9.887527 | 0.175285 | ... | 34.370474 | 33.939168 | 31.447180 | 1.577219e+09 | -27.717841 | -51.098865 | 0.009128 | PVS 1 | 0.0 | 0 |
| 2019-12-24 20:19:56.600 | 0.317292 | 0.157299 | 9.859981 | 0.297991 | 0.156849 | 9.824431 | 0.498695 | 0.404766 | 9.923440 | 0.205803 | ... | 34.514242 | 33.963129 | 31.447180 | 1.577219e+09 | -27.717841 | -51.098865 | 0.009128 | PVS 1 | 0.0 | 0 |
| 2019-12-24 20:19:56.620 | 0.295744 | 0.148919 | 9.885120 | 0.266866 | 0.156849 | 9.835205 | 0.496300 | 0.097111 | 9.856403 | -0.206184 | ... | 34.370474 | 34.082936 | 31.638871 | 1.577219e+09 | -27.717841 | -51.098865 | 0.009128 | PVS 1 | 0.0 | 0 |
5 rows × 34 columns
Out[4]:
| columns | |
|---|---|
| 0 | gps_acc_x_dashboard |
| 1 | gps_acc_y_dashboard |
| 2 | gps_acc_z_dashboard |
| 3 | gps_acc_x_above_suspension |
| 4 | gps_acc_y_above_suspension |
| 5 | gps_acc_z_above_suspension |
| 6 | gps_acc_x_below_suspension |
| 7 | gps_acc_y_below_suspension |
| 8 | gps_acc_z_below_suspension |
| 9 | gps_gyro_x_dashboard |
| 10 | gps_gyro_y_dashboard |
| 11 | gps_gyro_z_dashboard |
| 12 | gps_gyro_x_above_suspension |
| 13 | gps_gyro_y_above_suspension |
| 14 | gps_gyro_z_above_suspension |
| 15 | gps_gyro_x_below_suspension |
| 16 | gps_gyro_y_below_suspension |
| 17 | gps_gyro_z_below_suspension |
| 18 | gps_mag_x_dashboard |
| 19 | gps_mag_y_dashboard |
| 20 | gps_mag_z_dashboard |
| 21 | gps_mag_x_above_suspension |
| 22 | gps_mag_y_above_suspension |
| 23 | gps_mag_z_above_suspension |
| 24 | gps_temp_dashboard |
In [5]:
# --- A2-1b. Engineer common features if missing ---
def ensure_feature(df, name, expr):
if name not in df.columns:
df[name] = expr(df)
ensure_feature(df, "acc_mag",
lambda d: np.sqrt(d.filter(like="gps_acc_").pow(2).sum(axis=1)))
ensure_feature(df, "gyro_mag",
lambda d: np.sqrt(d.filter(like="gps_gyro_").pow(2).sum(axis=1)))
ensure_feature(df, "speed_drop",
lambda d: d["gps_speed"].diff().clip(upper=0).abs().fillna(0))
# target column should exist (rename to your project target if needed)
target = "proxy_incident" # 0/1
assert target in df.columns, "Add/rename your label column to 'proxy_incident'."
Central Tendancy¶
In [7]:
num_cols = ["gps_speed", "acc_mag", "gyro_mag", "speed_drop"]
summary_ct = {}
for col in num_cols:
s = df[col].dropna()
mode_vals = s.mode()
summary_ct[col] = {
"count": int(s.shape[0]),
"mean": float(s.mean()),
"median": float(s.median()),
"mode": float(mode_vals.iloc[0]) if not mode_vals.empty else np.nan,
"min": float(s.min()),
"max": float(s.max()),
"midrange": float((s.max() + s.min())/2.0),
}
ct_df = pd.DataFrame(summary_ct).T.round(4)
display(ct_df)
# (optional) save
#ct_df.to_csv(f"{ART}/A2_central_tendency.csv", index=True)
| count | mean | median | mode | min | max | midrange | |
|---|---|---|---|---|---|---|---|
| gps_speed | 72019.0 | 9.5567 | 6.6189 | 0.0057 | 0.0025 | 26.8745 | 13.4385 |
| acc_mag | 72019.0 | 18.4213 | 17.3682 | 12.9027 | 4.0950 | 71.4404 | 37.7677 |
| gyro_mag | 72019.0 | 27.7976 | 22.8371 | 61.5897 | 0.0895 | 267.2069 | 133.6482 |
| speed_drop | 72019.0 | 0.1590 | 0.0042 | 0.0000 | 0.0000 | 3.8953 | 1.9476 |
Dispersion¶
In [10]:
disp_rows = []
for col in num_cols:
s = df[col].dropna()
q1, q3 = s.quantile(0.25), s.quantile(0.75)
disp_rows.append({
"feature": col,
"range": float(s.max() - s.min()),
"Q1": float(q1),
"Q3": float(q3),
"IQR": float(q3 - q1),
"variance": float(s.var()),
"std_dev": float(s.std())
})
disp_df = pd.DataFrame(disp_rows).set_index("feature").round(4)
display(disp_df)
#disp_df.to_csv(f"{ART}/A2_dispersion.csv")
# Optional supporting boxplot (one figure with four boxes)
plt.figure(figsize=(7,4))
sns.boxplot(data=df[num_cols], orient="h")
plt.title("Dispersion via Boxplots (PVS_1)")
plt.tight_layout()
#plt.savefig(f"{ART}/A2_boxplots.png", dpi=300)
plt.show()
| range | Q1 | Q3 | IQR | variance | std_dev | |
|---|---|---|---|---|---|---|
| feature | ||||||
| gps_speed | 26.8720 | 4.5089 | 16.6475 | 12.1386 | 60.0073 | 7.7464 |
| acc_mag | 67.3453 | 16.1825 | 19.8167 | 3.6342 | 17.3873 | 4.1698 |
| gyro_mag | 267.1173 | 12.7269 | 37.5753 | 24.8484 | 503.6426 | 22.4420 |
| speed_drop | 3.8953 | 0.0000 | 0.2260 | 0.2260 | 0.0816 | 0.2856 |
Skewness¶
In [11]:
skew_tbl = df[num_cols].skew(numeric_only=True).to_frame("skew").round(4)
display(skew_tbl)
#skew_tbl.to_csv(f"{ART}/A2_skewness.csv")
# One supporting histogram/KDE for a key feature
plt.figure(figsize=(6,3.5))
sns.kdeplot(df["gps_speed"].dropna(), fill=True)
plt.title("KDE: gps_speed (PVS_1)")
plt.tight_layout()
#plt.savefig(f"{ART}/A2_kde_speed.png", dpi=300)
plt.show()
| skew | |
|---|---|
| gps_speed | 0.8553 |
| acc_mag | 2.0755 |
| gyro_mag | 1.7439 |
| speed_drop | 3.8867 |
In [12]:
cls_counts = df[target].value_counts().sort_index()
display(cls_counts.to_frame("count"))
plt.figure(figsize=(4.5,3.5))
cls_counts.plot(kind="bar", color=["#4C78A8", "#F58518"])
plt.title("Class Counts (0=No Accident, 1=Accident)")
plt.xlabel("proxy_incident")
plt.ylabel("count")
plt.tight_layout()
#plt.savefig(f"{ART}/A2_class_counts.png", dpi=300)
plt.show()
# Optional % table
(cls_counts / cls_counts.sum()).round(4).to_frame("proportion")
| count | |
|---|---|
| proxy_incident | |
| 0 | 72019 |
Out[12]:
| proportion | |
|---|---|
| proxy_incident | |
| 0 | 1.0 |
Correlation¶
In [13]:
# choose only numeric columns used downstream
corr = df[num_cols].corr(numeric_only=True)
plt.figure(figsize=(5.2,4.5))
sns.heatmap(corr, vmin=-1, vmax=1, annot=True, cmap="coolwarm", square=True, cbar_kws={'shrink': .75})
plt.title("Correlation: Selected PVS Features")
plt.tight_layout()
#plt.savefig(f"{ART}/A2_corr_heatmap.png", dpi=300)
plt.show()
In [14]:
# Proxy label distribution
if "proxy_incident" in df.columns:
vc = df["proxy_incident"].value_counts()
print("Proxy incident counts:\n", vc)
vc.plot(kind="bar")
plt.title("Proxy incident distribution (per timestamp)")
plt.xticks(rotation=0); plt.show()
Proxy incident counts: proxy_incident 0 72019 Name: count, dtype: int64
In [16]:
# Libraries
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Paths
DATA = r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\clean_resampled\PVS 1.parquet"
ART = r"C:\Users\<you>\Desktop\Accident_Detection_Project_Dataset\report_artifacts"
# Load
df = pd.read_parquet(DATA)
# Ensure engineered features exist (same as A2)
def ensure_feature(df, name, expr):
if name not in df.columns:
df[name] = expr(df)
ensure_feature(df, "acc_mag",
lambda d: np.sqrt(d.filter(like="gps_acc_").pow(2).sum(axis=1)))
ensure_feature(df, "gyro_mag",
lambda d: np.sqrt(d.filter(like="gps_gyro_").pow(2).sum(axis=1)))
ensure_feature(df, "speed_drop",
lambda d: d["gps_speed"].diff().clip(upper=0).abs().fillna(0))
target = "proxy_incident" # 0/1 label column
assert target in df.columns
# Cast numeric columns used in plots
num_cols = ["gps_speed", "acc_mag", "gyro_mag", "speed_drop"]
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")
# Small helper to save/show uniformly
def save_fig(path):
plt.tight_layout()
plt.savefig(path, dpi=300)
plt.show()
In [18]:
ts = df.set_index("timestamp").sort_index()
ts_sample = ts[num_cols].resample("1S").mean().iloc[:600] # first 10 minutes
plt.figure(figsize=(9,3.5))
plt.plot(ts_sample.index, ts_sample["gps_speed"])
plt.title("Line Plot: gps_speed over time (first 10 minutes)")
plt.xlabel("Time"); plt.ylabel("Speed (km/h)")
#save_fig(f"{ART}/A3_line_speed.png")
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) ~\AppData\Local\Temp\ipykernel_11892\439280025.py in ?() ----> 1 ts = df.set_index("timestamp").sort_index() 2 ts_sample = ts[num_cols].resample("1S").mean().iloc[:600] # first 10 minutes 3 plt.figure(figsize=(9,3.5)) 4 plt.plot(ts_sample.index, ts_sample["gps_speed"]) C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\frame.py in ?(self, keys, drop, append, inplace, verify_integrity) 6118 if not found: 6119 missing.append(col) 6120 6121 if missing: -> 6122 raise KeyError(f"None of {missing} are in the columns") 6123 6124 if inplace: 6125 frame = self KeyError: "None of ['timestamp'] are in the columns"
Assignment 2 Testing¶
In [1]:
# --- PATHS ---
from pathlib import Path
BASE = Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
WORK = BASE / "work"
CLEAN_DIR = WORK / "clean_resampled"
EDA_DIR = WORK / "eda_reports"
EDA_DIR.mkdir(parents=True, exist_ok=True)
# --- LIBS ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import chain
plt.rcParams["figure.dpi"] = 140
# --- HELPERS ---
def save_fig(name):
out = EDA_DIR / f"{name}.png"
plt.tight_layout()
plt.savefig(out, bbox_inches="tight")
print("Saved figure:", out)
def save_csv(df, name):
out = EDA_DIR / f"{name}.csv"
df.to_csv(out, index=False)
print("Saved CSV:", out)
def pick_first_existing(df, candidates):
for c in candidates:
if c in df.columns:
return c
return None
In [2]:
# Load a single cleaned parquet to prototype EDA (PVS 1)
one = CLEAN_DIR / "PVS 1.parquet"
df = pd.read_parquet(one)
print(df.shape)
df.head(3)
(72019, 34)
Out[2]:
| gps_acc_x_dashboard | gps_acc_y_dashboard | gps_acc_z_dashboard | gps_acc_x_above_suspension | gps_acc_y_above_suspension | gps_acc_z_above_suspension | gps_acc_x_below_suspension | gps_acc_y_below_suspension | gps_acc_z_below_suspension | gps_gyro_x_dashboard | ... | gps_temp_dashboard | gps_temp_above_suspension | gps_temp_below_suspension | gps_timestamp_gps | gps_latitude | gps_longitude | gps_speed | session | speed_drop | proxy_incident | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| timestamp | |||||||||||||||||||||
| 2019-12-24 20:19:56.540 | 0.314897 | 0.187227 | 9.863572 | 0.314750 | 0.166426 | 9.808869 | 0.529819 | 0.097111 | 9.930623 | 0.221062 | ... | 34.274628 | 34.035014 | 31.926408 | 1.577219e+09 | -27.717841 | -51.098865 | 0.009128 | PVS 1 | 0.0 | 0 |
| 2019-12-24 20:19:56.560 | 0.297539 | 0.187227 | 9.869558 | 0.313553 | 0.154455 | 9.855556 | 0.525031 | 0.100702 | 9.948579 | 0.045586 | ... | 34.358493 | 34.082936 | 31.734717 | 1.577219e+09 | -27.717841 | -51.098865 | 0.009128 | PVS 1 | 0.0 | 0 |
| 2019-12-24 20:19:56.580 | 0.308912 | 0.199198 | 9.842024 | 0.332706 | 0.159244 | 9.831614 | 0.533411 | 0.092323 | 9.887527 | 0.175285 | ... | 34.370474 | 33.939168 | 31.447180 | 1.577219e+09 | -27.717841 | -51.098865 | 0.009128 | PVS 1 | 0.0 | 0 |
3 rows × 34 columns
In [3]:
# Detect label
label_col = pick_first_existing(df, ["proxy_incident", "label", "target", "y", "class"])
print("Label column:", label_col)
# Candidate numeric features (use what exists)
candidates = ["gps_speed", "acc_mag", "gyro_mag", "speed_drop"]
num_cols = [c for c in candidates if c in df.columns]
print("Numeric features found:", num_cols)
# Drop fully-empty columns if any
df = df.dropna(axis=1, how="all")
Label column: proxy_incident Numeric features found: ['gps_speed', 'speed_drop']
In [4]:
miss = df.isna().sum().reset_index()
miss.columns = ["column", "missing_count"]
miss["missing_pct"] = (miss["missing_count"]/len(df)).round(4)
save_csv(miss, "missing_values")
Saved CSV: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\missing_values.csv
In [5]:
summary_rows = []
for col in num_cols:
s = df[col].dropna()
mode_ = s.mode().iloc[0] if not s.mode().empty else np.nan
midrange = (s.min() + s.max()) / 2.0
summary_rows.append({
"feature": col,
"count": int(s.shape[0]),
"mean": s.mean(),
"median": s.median(),
"mode": mode_,
"min": s.min(),
"max": s.max(),
"midrange": midrange
})
summary_df = pd.DataFrame(summary_rows)
save_csv(summary_df, "numeric_summary")
summary_df
Saved CSV: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\numeric_summary.csv
Out[5]:
| feature | count | mean | median | mode | min | max | midrange | |
|---|---|---|---|---|---|---|---|---|
| 0 | gps_speed | 72019 | 9.556739 | 6.618945 | 0.005715 | 0.002526 | 26.874480 | 13.438503 |
| 1 | speed_drop | 72019 | 0.159041 | 0.004228 | 0.000000 | 0.000000 | 3.895296 | 1.947648 |
In [6]:
for col in num_cols:
s = df[col].dropna()
fig, ax = plt.subplots(figsize=(6,3))
ax.hist(s, bins=60, alpha=0.7)
ax.axvline(s.mean(), color="r", linestyle="--", label=f"Mean={s.mean():.2f}")
ax.axvline(s.median(), color="g", linestyle="-.", label=f"Median={s.median():.2f}")
ax.set_title(f"Histogram with Mean/Median — {col}")
ax.legend()
save_fig(f"central_tendency_{col}")
plt.close(fig)
Saved figure: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\central_tendency_gps_speed.png Saved figure: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\central_tendency_speed_drop.png
In [7]:
disp_rows = []
for col in num_cols:
s = df[col].dropna()
q1, q2, q3 = s.quantile([0.25, 0.50, 0.75])
disp_rows.append({
"feature": col,
"range": s.max() - s.min(),
"q1": q1,
"median(q2)": q2,
"q3": q3,
"iqr": q3 - q1,
"variance": s.var(),
"std": s.std()
})
disp_df = pd.DataFrame(disp_rows)
save_csv(disp_df, "dispersion_table")
disp_df
Saved CSV: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\dispersion_table.csv
Out[7]:
| feature | range | q1 | median(q2) | q3 | iqr | variance | std | |
|---|---|---|---|---|---|---|---|---|
| 0 | gps_speed | 26.871954 | 4.508887 | 6.618945 | 16.647470 | 12.138583 | 60.007341 | 7.746441 |
| 1 | speed_drop | 3.895296 | 0.000000 | 0.004228 | 0.225994 | 0.225994 | 0.081572 | 0.285608 |
In [8]:
for col in num_cols:
fig, ax = plt.subplots(1, 2, figsize=(9,3))
sns.boxplot(x=df[col], ax=ax[0])
ax[0].set_title(f"Box — {col}")
sns.violinplot(x=df[col], ax=ax[1])
ax[1].set_title(f"Violin — {col}")
save_fig(f"box_violin_{col}")
plt.close(fig)
Saved figure: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\box_violin_gps_speed.png Saved figure: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\box_violin_speed_drop.png
In [9]:
sk_rows = []
for col in num_cols:
s = df[col].dropna()
sk_rows.append({"feature": col, "skewness": s.skew()})
fig, ax = plt.subplots(figsize=(6,3))
sns.kdeplot(s, fill=True, ax=ax)
ax.set_title(f"KDE & Skewness — {col} (skew={s.skew():.2f})")
save_fig(f"skew_kde_{col}")
plt.close(fig)
sk_df = pd.DataFrame(sk_rows)
save_csv(sk_df, "skewness_table")
sk_df
Saved figure: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\skew_kde_gps_speed.png Saved figure: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\skew_kde_speed_drop.png Saved CSV: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\skewness_table.csv
Out[9]:
| feature | skewness | |
|---|---|---|
| 0 | gps_speed | 0.855313 |
| 1 | speed_drop | 3.886691 |
Class Imbalance¶
In [10]:
if label_col:
counts = df[label_col].value_counts().rename_axis("class").reset_index(name="count")
counts["pct"] = (counts["count"]/counts["count"].sum()).round(4)
save_csv(counts, "class_counts")
print(counts)
# Bar chart
fig, ax = plt.subplots(figsize=(4,3))
sns.barplot(x="class", y="count", data=counts, ax=ax)
ax.set_title("Class Balance (Counts)")
for i, v in enumerate(counts["count"]):
ax.text(i, v, str(v), ha="center", va="bottom")
save_fig("class_balance")
plt.close(fig)
Saved CSV: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\class_counts.csv class count pct 0 0 72019 1.0 Saved figure: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\class_balance.png
In [11]:
num_for_corr = df.select_dtypes(include=[np.number]).drop(columns=[label_col], errors="ignore")
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(num_for_corr.corr(), cmap="coolwarm", center=0)
ax.set_title("Correlation Heatmap of Sensor Features")
save_fig("corr_heatmap")
plt.close(fig)
Saved figure: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports\corr_heatmap.png
In [12]:
# If you want bigger sample for EDA: concatenate PVS 1..3
files = [CLEAN_DIR/f"PVS {i}.parquet" for i in [1,2,3] if (CLEAN_DIR/f"PVS {i}.parquet").exists()]
big = pd.concat([pd.read_parquet(p) for p in files], ignore_index=True)
big.shape
# You can rerun sections 2–8 with df = big
Out[12]:
(187270, 34)
In [13]:
import matplotlib.pyplot as plt
import seaborn as sns
feature = "gps_speed" # you can also try 'acc_mag' or 'gyro_mag'
s = df[feature].dropna()
plt.figure(figsize=(7,4))
sns.histplot(s, bins=50, color='skyblue', edgecolor='black', kde=True)
plt.axvline(s.mean(), color='red', linestyle='--', linewidth=2, label=f"Mean = {s.mean():.2f}")
plt.axvline(s.median(), color='green', linestyle=':', linewidth=2, label=f"Median = {s.median():.2f}")
plt.title(f"Histogram of {feature} with Mean and Median")
plt.xlabel(feature)
plt.ylabel("Frequency")
plt.legend()
plt.tight_layout()
plt.show()
In [14]:
means = df[["gps_speed", "acc_mag", "gyro_mag", "speed_drop"]].mean()
plt.figure(figsize=(6,4))
means.plot(kind='bar', color=['#5DADE2', '#F5B041', '#58D68D', '#AF7AC5'])
plt.title("Mean Values of Key Sensor Features")
plt.xlabel("Feature")
plt.ylabel("Mean")
plt.tight_layout()
plt.show()
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) Cell In[14], line 1 ----> 1 means = df[["gps_speed", "acc_mag", "gyro_mag", "speed_drop"]].mean() 2 plt.figure(figsize=(6,4)) 3 means.plot(kind='bar', color=['#5DADE2', '#F5B041', '#58D68D', '#AF7AC5']) File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\frame.py:4108, in DataFrame.__getitem__(self, key) 4106 if is_iterator(key): 4107 key = list(key) -> 4108 indexer = self.columns._get_indexer_strict(key, "columns")[1] 4110 # take() does not accept boolean indexers 4111 if getattr(indexer, "dtype", None) == bool: File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\indexes\base.py:6200, in Index._get_indexer_strict(self, key, axis_name) 6197 else: 6198 keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr) -> 6200 self._raise_if_missing(keyarr, indexer, axis_name) 6202 keyarr = self.take(indexer) 6203 if isinstance(key, Index): 6204 # GH 42790 - Preserve name from an Index File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\indexes\base.py:6252, in Index._raise_if_missing(self, key, indexer, axis_name) 6249 raise KeyError(f"None of [{key}] are in the [{axis_name}]") 6251 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique()) -> 6252 raise KeyError(f"{not_found} not in index") KeyError: "['acc_mag', 'gyro_mag'] not in index"
In [15]:
plt.figure(figsize=(4,3.5))
df["proxy_incident"].value_counts().sort_index().plot(kind='bar', color=["#3498DB","#E74C3C"], edgecolor='black')
plt.title("Class Distribution: No Accident (0) vs Accident (1)")
plt.xlabel("Class Label")
plt.ylabel("Count")
plt.xticks([0,1], ["No Accident", "Accident"], rotation=0)
plt.tight_layout()
plt.show()
In [16]:
plt.figure(figsize=(4,4))
df["proxy_incident"].value_counts().sort_index().plot.pie(
autopct='%1.1f%%', colors=["#5DADE2","#E74C3C"], labels=["No Accident", "Accident"],
startangle=90, wedgeprops={"edgecolor":"white"}
)
plt.title("Proportion of Accident vs Non-Accident Samples")
plt.ylabel("") # remove y-label
plt.tight_layout()
plt.show()
In [17]:
class_stats = df["proxy_incident"].value_counts().rename_axis("Class").reset_index(name="Count")
class_stats["Percentage"] = (class_stats["Count"]/class_stats["Count"].sum()*100).round(2)
class_stats
Out[17]:
| Class | Count | Percentage | |
|---|---|---|---|
| 0 | 0 | 72019 | 100.0 |
Assignemnt 3 - Graphs¶
In [18]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
# ---- paths (match your folder layout) ----
BASE = Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
WORK = BASE / "work"
CLEAN = WORK / "clean_resampled" # where your PVS 1..9 parquet live
EDA = WORK / "eda_reports" / "visuals"
EDA.mkdir(parents=True, exist_ok=True)
# ---- load one or merge a few sessions (example: PVS 1) ----
df = pd.read_parquet(CLEAN / "PVS 1.parquet")
# label column name used in your work
LABEL = "proxy_incident" # 0: no-accident, 1: accident
# numeric-only view + a safe sample for heavy plots
num_df = df.select_dtypes(include=np.number).copy()
samp = num_df.sample(min(len(num_df), 200_000), random_state=42) # keeps things snappy
# consistent look
sns.set_theme(style="whitegrid")
palette = {"No Accident":"#2E86C1", "Accident":"#E74C3C"}
Line Plot - Time Series¶
In [19]:
plt.figure(figsize=(9,4))
df["gps_speed"].iloc[:20_000].plot(color="#2E86C1")
plt.title("Line Plot – Vehicle Speed Over Time (PVS 1)")
plt.xlabel("Time index"); plt.ylabel("Speed (km/h)")
plt.tight_layout()
plt.savefig(EDA / "A_line_speed.png", dpi=200); plt.show()
Vertical Bar Plot - Class Counts¶
In [20]:
plt.figure(figsize=(4.5,4))
(df[LABEL]
.map({0:"No Accident",1:"Accident"})
.value_counts().reindex(["No Accident","Accident"])
.plot(kind="bar", color=[palette["No Accident"], palette["Accident"]], edgecolor="black"))
plt.title("Bar Plot – Class Distribution")
plt.xlabel("Class"); plt.ylabel("Count"); plt.xticks(rotation=0)
plt.tight_layout()
plt.savefig(EDA / "B_bar_vertical_class.png", dpi=200); plt.show()
Horizontal Bar Plot - feature means¶
In [33]:
import numpy as np
# create derived magnitudes if not already in your df
if "acc_mag" not in df.columns:
df["acc_mag"] = np.sqrt(
df.filter(like="gps_acc_").pow(2).sum(axis=1)
)
if "gyro_mag" not in df.columns:
df["gyro_mag"] = np.sqrt(
df.filter(like="gps_gyro_").pow(2).sum(axis=1)
)
# make a numeric-only copy for plotting
num_df = df.select_dtypes(include=np.number)
In [34]:
means = num_df[["gps_speed","acc_mag","gyro_mag","speed_drop"]].mean().sort_values()
plt.figure(figsize=(6,4))
means.plot(kind="barh", color="#5DADE2", edgecolor="black")
plt.title("Horizontal Bar Plot – Mean of Key Sensors")
plt.xlabel("Mean value"); plt.ylabel("Feature")
plt.tight_layout(); plt.show()
Histogram - with KDE overlay¶
In [36]:
features = ["gps_speed", "acc_mag", "gyro_mag", "speed_drop"]
for col in features:
plt.figure(figsize=(6,4))
sns.histplot(df[col].dropna(), bins=50, kde=True, color="#5DADE2", edgecolor="black")
plt.title(f"Histogram + KDE – {col}")
plt.xlabel(col); plt.ylabel("Frequency")
plt.tight_layout(); plt.show()
Pie Chart - Class Proportion¶
In [23]:
plt.figure(figsize=(4.6,4.6))
(df[LABEL].map({0:"No Accident",1:"Accident"})
.value_counts().reindex(["No Accident","Accident"])
.plot.pie(autopct="%.1f%%", colors=[palette["No Accident"], palette["Accident"]],
startangle=90, wedgeprops={"edgecolor":"white"}))
plt.title("Pie – Accident vs Non-Accident")
plt.ylabel("")
plt.tight_layout()
plt.savefig(EDA / "E_pie_class.png", dpi=200); plt.show()
KDE - Density Plot¶
In [24]:
plt.figure(figsize=(6,4))
sns.kdeplot(data=num_df, x="gps_speed", fill=True, alpha=0.35, color="#2E86C1")
plt.title("KDE – Speed Distribution")
plt.xlabel("Speed (km/h)"); plt.ylabel("Density")
plt.tight_layout()
plt.savefig(EDA / "F_kde_speed.png", dpi=200); plt.show()
Area Plot - Cumulative mean¶
In [25]:
plt.figure(figsize=(7,3.8))
num_df["gps_speed"].dropna().rolling(1500).mean().clip(lower=0).plot(kind="area", color="#AED6F1")
plt.title("Area – Rolling Mean Speed (Window=1500)")
plt.xlabel("Time index"); plt.ylabel("Mean speed (km/h)")
plt.tight_layout()
plt.savefig(EDA / "G_area_rolling_mean_speed.png", dpi=200); plt.show()
Box and Whisket Plot¶
In [37]:
import matplotlib.pyplot as plt
import seaborn as sns
# Choose features you want to visualize
features = ["gps_speed", "acc_mag", "gyro_mag", "speed_drop"]
plt.figure(figsize=(8, 5))
sns.boxplot(data=df[features], orient="h", palette="pastel")
plt.title("Box and Whisker Plot – Feature Distributions")
plt.xlabel("Value")
plt.ylabel("Sensor Features")
plt.tight_layout()
plt.show()
In [31]:
# Boxplots: y=0 vs y=1 for each chosen feature
import math
r = math.ceil(len(top)/2)
fig, axes = plt.subplots(r, 2, figsize=(12, 4*r))
axes = axes.flatten()
for i, c in enumerate(top):
try:
W.boxplot(column=c, by="y", ax=axes[i])
axes[i].set_title(c); axes[i].set_xlabel("y"); axes[i].set_ylabel(c)
except Exception as e:
axes[i].set_visible(False)
plt.suptitle("Feature distributions by class (windows)"); plt.tight_layout(); plt.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[31], line 3 1 # Boxplots: y=0 vs y=1 for each chosen feature 2 import math ----> 3 r = math.ceil(len(top)/2) 4 fig, axes = plt.subplots(r, 2, figsize=(12, 4*r)) 5 axes = axes.flatten() NameError: name 'top' is not defined
Scatter Plot - Relationship¶
In [38]:
plt.figure(figsize=(6, 4))
sns.scatterplot(
data=df,
x="gps_speed",
y="acc_mag",
hue="proxy_incident",
palette={0: "#5DADE2", 1: "#E74C3C"},
alpha=0.6,
s=15
)
plt.title("Scatter Plot – Speed vs Acceleration Magnitude")
plt.xlabel("Speed (km/h)")
plt.ylabel("Acceleration (m/s²)")
plt.legend(title="Class (0=Normal, 1=Accident)")
plt.tight_layout()
plt.show()
In [39]:
plt.figure(figsize=(6, 4))
sns.scatterplot(
data=df,
x="gps_speed",
y="gyro_mag",
hue="proxy_incident",
palette={0: "#5DADE2", 1: "#E74C3C"},
alpha=0.6,
s=15
)
plt.title("Scatter Plot – Speed vs Gyroscope Magnitude")
plt.xlabel("Speed (km/h)")
plt.ylabel("Gyroscope (°/s)")
plt.legend(title="Class (0=Normal, 1=Accident)")
plt.tight_layout()
plt.show()
Hexbin Plot - Dense Scatter Alternative¶
In [42]:
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
# where to save figures (adjust if you like)
ART = r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\eda_reports"
os.makedirs(ART, exist_ok=True)
# df must already be loaded; if not, uncomment and point to your file:
# df = pd.read_parquet(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\clean_resampled\PVS 1.parquet")
# ---- derived features expected by the plots ----
def pick(df, *candidates):
"""Return first column that exists, else None."""
for c in candidates:
if c in df.columns: return c
return None
# acceleration magnitude (prefer '..._dashboard' axes)
ax = pick(df, "gps_acc_x_dashboard", "gps_acc_x_above_suspension", "gps_acc_x_below_suspension")
ay = pick(df, "gps_acc_y_dashboard", "gps_acc_y_above_suspension", "gps_acc_y_below_suspension")
az = pick(df, "gps_acc_z_dashboard", "gps_acc_z_above_suspension", "gps_acc_z_below_suspension")
if all([ax, ay, az]):
df["acc_mag"] = np.sqrt(df[ax]**2 + df[ay]**2 + df[az]**2)
# gyroscope magnitude (prefer '..._dashboard' axes)
gx = pick(df, "gps_gyro_x_dashboard", "gps_gyro_x_above_suspension", "gps_gyro_x_below_suspension")
gy = pick(df, "gps_gyro_y_dashboard", "gps_gyro_y_above_suspension", "gps_gyro_y_below_suspension")
gz = pick(df, "gps_gyro_z_dashboard", "gps_gyro_z_above_suspension", "gps_gyro_z_below_suspension")
if all([gx, gy, gz]):
df["gyro_mag"] = np.sqrt(df[gx]**2 + df[gy]**2 + df[gz]**2)
# speed_drop (positive when speed decreases)
if "gps_speed" in df.columns:
df["speed_drop"] = (-df["gps_speed"].diff()).clip(lower=0).fillna(0)
# label fallback if needed
label_col = "proxy_incident" if "proxy_incident" in df.columns else None
if label_col is None:
df["proxy_incident"] = 0
label_col = "proxy_incident"
# analysis subset used by the plots
want = [c for c in ["gps_speed","acc_mag","gyro_mag","speed_drop", label_col] if c in df.columns]
seg = df[want].dropna()
print("seg shape:", seg.shape, "cols:", list(seg.columns))
seg shape: (72019, 5) cols: ['gps_speed', 'acc_mag', 'gyro_mag', 'speed_drop', 'proxy_incident']
In [43]:
if {"gps_speed", "gyro_mag"}.issubset(seg.columns):
plt.figure(figsize=(6.2,4.5))
plt.hexbin(seg["gps_speed"], seg["gyro_mag"], gridsize=50, cmap="viridis", mincnt=5)
cb = plt.colorbar(); cb.set_label("Counts")
plt.title("Hexbin: GPS speed vs Gyro magnitude")
plt.xlabel("gps_speed"); plt.ylabel("gyro_mag")
plt.tight_layout()
plt.savefig(os.path.join(ART, "hexbin_speed_gyro.png"), dpi=200)
plt.show()
else:
print("Hexbin skipped: need both 'gps_speed' and 'gyro_mag'. Present:", list(seg.columns))
Correlation Matrix - Heat Map¶
In [44]:
# keep numeric columns only (avoids "could not convert string to float: 'PVS 1'")
num_for_corr = seg.select_dtypes(include=[np.number])
corr = num_for_corr.corr().clip(-1, 1)
plt.figure(figsize=(6.5,5.2))
sns.heatmap(corr, cmap="coolwarm", vmin=-1, vmax=1, cbar_kws={"shrink": .85})
plt.title("Correlation Heatmap of Numeric Features")
plt.tight_layout()
plt.savefig(os.path.join(ART, "heatmap_corr_numeric.png"), dpi=200)
plt.show()
Assignment 4 - Preprocessing¶
In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import (
accuracy_score,
precision_score,
recall_score,
f1_score,
roc_auc_score,
classification_report,
confusion_matrix,
ConfusionMatrixDisplay
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (8, 5)
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
DATA_PATH = Path("C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset")
LABEL_COL = "label"
Cell In[2], line 30 DATA_PATH = Path("C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset") ^ SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape
In [3]:
from pathlib import Path
import pandas as pd
import glob
WINDOWS_PATH = Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\work\windows")
files = glob.glob(str(WINDOWS_PATH / "*.parquet"))
df_list = [pd.read_parquet(f) for f in files]
df_final = pd.concat(df_list, ignore_index=True)
# Save final combined dataset
OUTPUT_FILE = Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\pvs_final.csv")
df_final.to_csv(OUTPUT_FILE, index=False)
print("Final dataset saved as:", OUTPUT_FILE)
print("Shape:", df_final.shape)
df_final.head()
Final dataset saved as: C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\pvs_final.csv Shape: (7193, 169)
Out[3]:
| session | t_start | t_end | y | gps_acc_x_dashboard_mean | gps_acc_x_dashboard_std | gps_acc_x_dashboard_min | gps_acc_x_dashboard_max | gps_acc_x_dashboard_rms | gps_acc_y_dashboard_mean | ... | speed_drop_mean | speed_drop_std | speed_drop_min | speed_drop_max | speed_drop_rms | proxy_incident_mean | proxy_incident_std | proxy_incident_min | proxy_incident_max | proxy_incident_rms | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | PVS 1 | 2019-12-24 20:19:56.540 | 2019-12-24 20:19:59.520 | 0 | 0.306168 | 0.016296 | 0.270605 | 0.365176 | 0.306598 | 0.165220 | ... | 0.000099 | 0.000143 | 0.0 | 0.000303 | 0.000173 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | PVS 1 | 2019-12-24 20:19:58.040 | 2019-12-24 20:20:01.020 | 0 | 0.307486 | 0.026242 | 0.223918 | 0.374752 | 0.308596 | 0.163978 | ... | 0.000201 | 0.000913 | 0.0 | 0.007933 | 0.000932 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | PVS 1 | 2019-12-24 20:19:59.540 | 2019-12-24 20:20:02.520 | 0 | 0.306282 | 0.030071 | 0.195187 | 0.385526 | 0.307745 | 0.163780 | ... | 0.002592 | 0.003733 | 0.0 | 0.007933 | 0.004534 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | PVS 1 | 2019-12-24 20:20:01.040 | 2019-12-24 20:20:04.020 | 0 | 0.307264 | 0.024270 | 0.195187 | 0.385526 | 0.308215 | 0.164431 | ... | 0.002487 | 0.003691 | 0.0 | 0.007933 | 0.004441 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | PVS 1 | 2019-12-24 20:20:02.540 | 2019-12-24 20:20:05.520 | 0 | 0.306369 | 0.018382 | 0.263422 | 0.367570 | 0.306916 | 0.163311 | ... | 0.000186 | 0.000305 | 0.0 | 0.000827 | 0.000357 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 169 columns
In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import (
accuracy_score,
precision_score,
recall_score,
f1_score,
roc_auc_score,
classification_report,
confusion_matrix,
ConfusionMatrixDisplay
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (8, 5)
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
DATA_PATH = Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\pvs_final.csv")
LABEL_COL = "label"
Loading the Final Dataset¶
In [5]:
import pandas as pd
import numpy as np
from pathlib import Path
DATA_PATH = Path(r"C:\Users\sn161663\Desktop\Accident_Detection_Project_Dataset\pvs_final.csv")
df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()
(7193, 169)
Out[5]:
| session | t_start | t_end | y | gps_acc_x_dashboard_mean | gps_acc_x_dashboard_std | gps_acc_x_dashboard_min | gps_acc_x_dashboard_max | gps_acc_x_dashboard_rms | gps_acc_y_dashboard_mean | ... | speed_drop_mean | speed_drop_std | speed_drop_min | speed_drop_max | speed_drop_rms | proxy_incident_mean | proxy_incident_std | proxy_incident_min | proxy_incident_max | proxy_incident_rms | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | PVS 1 | 2019-12-24 20:19:56.540 | 2019-12-24 20:19:59.520 | 0 | 0.306168 | 0.016296 | 0.270605 | 0.365176 | 0.306598 | 0.165220 | ... | 0.000099 | 0.000143 | 0.0 | 0.000303 | 0.000173 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | PVS 1 | 2019-12-24 20:19:58.040 | 2019-12-24 20:20:01.020 | 0 | 0.307486 | 0.026242 | 0.223918 | 0.374752 | 0.308596 | 0.163978 | ... | 0.000201 | 0.000913 | 0.0 | 0.007933 | 0.000932 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | PVS 1 | 2019-12-24 20:19:59.540 | 2019-12-24 20:20:02.520 | 0 | 0.306282 | 0.030071 | 0.195187 | 0.385526 | 0.307745 | 0.163780 | ... | 0.002592 | 0.003733 | 0.0 | 0.007933 | 0.004534 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | PVS 1 | 2019-12-24 20:20:01.040 | 2019-12-24 20:20:04.020 | 0 | 0.307264 | 0.024270 | 0.195187 | 0.385526 | 0.308215 | 0.164431 | ... | 0.002487 | 0.003691 | 0.0 | 0.007933 | 0.004441 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | PVS 1 | 2019-12-24 20:20:02.540 | 2019-12-24 20:20:05.520 | 0 | 0.306369 | 0.018382 | 0.263422 | 0.367570 | 0.306916 | 0.163311 | ... | 0.000186 | 0.000305 | 0.0 | 0.000827 | 0.000357 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 169 columns
Identifying and Handle Missing Values¶
In [19]:
df_out = df.copy()
missing = df_out.isna().sum()
print("Columns with missing values:")
print(missing[missing > 0])
numeric_cols_all = df_out.select_dtypes(include=[np.number]).columns
df_out[numeric_cols_all] = df_out[numeric_cols_all].fillna(df_out[numeric_cols_all].median())
print("\nTotal remaining NaNs:", df_out.isna().sum().sum())
Columns with missing values: Series([], dtype: int64) Total remaining NaNs: 0
In [6]:
missing = df.isna().sum()
print(missing[missing > 0])
Series([], dtype: int64)
In [7]:
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
Outliner Detection & Treatment¶
In [8]:
df_out = df.copy()
for col in numeric_cols:
lower = df_out[col].quantile(0.001)
upper = df_out[col].quantile(0.999)
df_out[col] = df_out[col].clip(lower, upper)
In [20]:
sample_features = [
c for c in df_out.select_dtypes(include=[np.number]).columns
if c not in ["y"]
][:8]
plt.figure(figsize=(10, 5))
sns.boxplot(data=df_out[sample_features])
plt.title("Boxplot of Sample Numeric Features")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
for col in numeric_cols_all:
lower = df_out[col].quantile(0.001)
upper = df_out[col].quantile(0.999)
df_out[col] = df_out[col].clip(lower, upper)
print("Outlier clipping done at 0.1% tails.")
Outlier clipping done at 0.1% tails.
Discretization¶
In [10]:
df_out = df_out.copy()
df_out["speed_bin"] = pd.cut(
df_out["gps_speed_mean"],
bins=[-1, 20, 40, 60, 200],
labels=["0–20", "20–40", "40–60", ">60"]
)
In [21]:
if "gps_speed_mean" in df_out.columns:
df_out = df_out.copy() # defragment
df_out["speed_bin"] = pd.cut(
df_out["gps_speed_mean"],
bins=[-1, 20, 40, 60, 200],
labels=["0–20", "20–40", "40–60", ">60"]
)
print(df_out["speed_bin"].value_counts())
sns.countplot(x="speed_bin", data=df_out)
plt.title("Speed Bin Distribution")
plt.show()
else:
print("gps_speed_mean not found; skipping discretization.")
speed_bin 0–20 5930 20–40 1263 40–60 0 >60 0 Name: count, dtype: int64
Scaling and Normalization¶
In [18]:
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
numeric_cols = df_out.select_dtypes(include=[np.number]).columns
X = df_out[numeric_cols].drop(columns=["y"])
y = df_out["y"]
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
std_scaler = StandardScaler()
mm_scaler = MinMaxScaler()
X_train_std = std_scaler.fit_transform(X_train)
X_test_std = std_scaler.transform(X_test)
X_train_mm = mm_scaler.fit_transform(X_train)
X_test_mm = mm_scaler.transform(X_test)
print("X_train_std shape:", X_train_std.shape)
print("X_train_mm shape:", X_train_mm.shape)
X_train_std shape: (5754, 165) X_train_mm shape: (5754, 165)
In [15]:
drop_cols = ["session", "t_start", "t_end"]
X = df_out.drop(columns=drop_cols + ["y"])
y = df_out["y"]
Train/Test Split¶
In [22]:
X_train, X_test, y_train, y_test = train_test_split(
X,
y,
test_size=0.2,
stratify=y,
random_state=RANDOM_STATE
)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
std_scaler = StandardScaler()
mm_scaler = MinMaxScaler()
X_train_std = std_scaler.fit_transform(X_train)
X_test_std = std_scaler.transform(X_test)
X_train_mm = mm_scaler.fit_transform(X_train)
X_test_mm = mm_scaler.transform(X_test)
print("X_train_std shape:", X_train_std.shape)
print("X_train_mm shape:", X_train_mm.shape)
Train shape: (5754, 165) Test shape: (1439, 165) X_train_std shape: (5754, 165) X_train_mm shape: (5754, 165)
Select Numeric Features, Define X and Y¶
In [23]:
LABEL_COL = "y"
numeric_cols = df_out.select_dtypes(include=[np.number]).columns
print("Is label in numeric_cols?", LABEL_COL in numeric_cols)
X = df_out[numeric_cols].drop(columns=[LABEL_COL])
y = df_out[LABEL_COL]
print("X shape:", X.shape)
print("y shape:", y.shape)
Is label in numeric_cols? True X shape: (7193, 165) y shape: (7193,)
Data Integration¶
In [35]:
print("Integrated dataset shape:", df.shape)
df.head()
Integrated dataset shape: (7193, 169)
Out[35]:
| session | t_start | t_end | y | gps_acc_x_dashboard_mean | gps_acc_x_dashboard_std | gps_acc_x_dashboard_min | gps_acc_x_dashboard_max | gps_acc_x_dashboard_rms | gps_acc_y_dashboard_mean | ... | speed_drop_mean | speed_drop_std | speed_drop_min | speed_drop_max | speed_drop_rms | proxy_incident_mean | proxy_incident_std | proxy_incident_min | proxy_incident_max | proxy_incident_rms | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | PVS 1 | 2019-12-24 20:19:56.540 | 2019-12-24 20:19:59.520 | 0 | 0.306168 | 0.016296 | 0.270605 | 0.365176 | 0.306598 | 0.165220 | ... | 0.000099 | 0.000143 | 0.0 | 0.000303 | 0.000173 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | PVS 1 | 2019-12-24 20:19:58.040 | 2019-12-24 20:20:01.020 | 0 | 0.307486 | 0.026242 | 0.223918 | 0.374752 | 0.308596 | 0.163978 | ... | 0.000201 | 0.000913 | 0.0 | 0.007933 | 0.000932 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | PVS 1 | 2019-12-24 20:19:59.540 | 2019-12-24 20:20:02.520 | 0 | 0.306282 | 0.030071 | 0.195187 | 0.385526 | 0.307745 | 0.163780 | ... | 0.002592 | 0.003733 | 0.0 | 0.007933 | 0.004534 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | PVS 1 | 2019-12-24 20:20:01.040 | 2019-12-24 20:20:04.020 | 0 | 0.307264 | 0.024270 | 0.195187 | 0.385526 | 0.308215 | 0.164431 | ... | 0.002487 | 0.003691 | 0.0 | 0.007933 | 0.004441 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | PVS 1 | 2019-12-24 20:20:02.540 | 2019-12-24 20:20:05.520 | 0 | 0.306369 | 0.018382 | 0.263422 | 0.367570 | 0.306916 | 0.163311 | ... | 0.000186 | 0.000305 | 0.0 | 0.000827 | 0.000357 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 169 columns
Feature Selection¶
In [36]:
corr = df[numeric_cols].corr()
plt.figure(figsize=(10,7))
sns.heatmap(corr, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()
from sklearn.ensemble import RandomForestClassifier
rf_temp = RandomForestClassifier(n_estimators=200)
rf_temp.fit(X_mm, y)
importances = rf_temp.feature_importances_
feature_importance_df = pd.DataFrame({
"Feature": numeric_cols,
"Importance": importances
}).sort_values("Importance", ascending=False)
feature_importance_df.head(15)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[36], line 10 7 from sklearn.ensemble import RandomForestClassifier 9 rf_temp = RandomForestClassifier(n_estimators=200) ---> 10 rf_temp.fit(X_mm, y) 12 importances = rf_temp.feature_importances_ 13 feature_importance_df = pd.DataFrame({ 14 "Feature": numeric_cols, 15 "Importance": importances 16 }).sort_values("Importance", ascending=False) NameError: name 'X_mm' is not defined
In [39]:
from sklearn.ensemble import RandomForestClassifier
# Make sure df_out exists (it should be your cleaned, clipped DataFrame)
# If not, you can set: df_out = df.copy()
# 1) Select numeric columns and drop label from features
LABEL_COL = "y"
numeric_cols = df_out.select_dtypes(include=[np.number]).columns
feature_cols = [c for c in numeric_cols if c != LABEL_COL]
X_fs = df_out[feature_cols]
y_fs = df_out[LABEL_COL]
print("Feature matrix for selection:", X_fs.shape)
# 2) Train a temporary Random Forest for feature importance
rf_temp = RandomForestClassifier(
n_estimators=200,
random_state=42,
class_weight="balanced"
)
rf_temp.fit(X_fs, y_fs)
# 3) Build importance table
importances = rf_temp.feature_importances_
feature_importance_df = pd.DataFrame({
"Feature": feature_cols,
"Importance": importances
}).sort_values("Importance", ascending=False)
# Show top 15
feature_importance_df.head(15)
Feature matrix for selection: (7193, 165)
Out[39]:
| Feature | Importance | |
|---|---|---|
| 156 | speed_drop_std | 0.118236 |
| 161 | proxy_incident_std | 0.096313 |
| 163 | proxy_incident_max | 0.094612 |
| 158 | speed_drop_max | 0.090670 |
| 164 | proxy_incident_rms | 0.070909 |
| 160 | proxy_incident_mean | 0.052569 |
| 159 | speed_drop_rms | 0.042564 |
| 155 | speed_drop_mean | 0.040207 |
| 132 | gps_temp_below_suspension_min | 0.031928 |
| 151 | gps_speed_std | 0.027612 |
| 99 | gps_mag_y_dashboard_rms | 0.024630 |
| 95 | gps_mag_y_dashboard_mean | 0.020392 |
| 130 | gps_temp_below_suspension_mean | 0.018697 |
| 148 | gps_longitude_max | 0.017120 |
| 82 | gps_gyro_y_below_suspension_min | 0.016466 |
In [40]:
top_n = 15
plt.figure(figsize=(10,6))
sns.barplot(
data=feature_importance_df.head(top_n),
x="Importance", y="Feature"
)
plt.title("Top Feature Importances (Random Forest)")
plt.tight_layout()
plt.show()
Feature Extraction¶
In [42]:
import numpy as np
df_feat = df_out.copy()
acc_cols = [c for c in df_feat.columns if "acc" in c.lower()]
gyro_cols = [c for c in df_feat.columns if "gyro" in c.lower()]
print("Number of accelerometer-related features:", len(acc_cols))
print("Number of gyroscope-related features:", len(gyro_cols))
if acc_cols:
df_feat["acc_features_mean"] = df_feat[acc_cols].mean(axis=1)
df_feat["acc_features_max"] = df_feat[acc_cols].max(axis=1)
df_feat["acc_features_std"] = df_feat[acc_cols].std(axis=1)
else:
print("No accelerometer-related columns found for aggregation.")
if gyro_cols:
df_feat["gyro_features_mean"] = df_feat[gyro_cols].mean(axis=1)
df_feat["gyro_features_max"] = df_feat[gyro_cols].max(axis=1)
df_feat["gyro_features_std"] = df_feat[gyro_cols].std(axis=1)
else:
print("No gyroscope-related columns found for aggregation.")
print("New feature columns added:")
new_cols = [c for c in df_feat.columns if "features_" in c]
print(new_cols)
df_feat[new_cols].head()
Number of accelerometer-related features: 45 Number of gyroscope-related features: 45 New feature columns added: ['acc_features_mean', 'acc_features_max', 'acc_features_std', 'gyro_features_mean', 'gyro_features_max', 'gyro_features_std']
Out[42]:
| acc_features_mean | acc_features_max | acc_features_std | gyro_features_mean | gyro_features_max | gyro_features_std | |
|---|---|---|---|---|---|---|
| 0 | 2.780080 | 10.004245 | 4.324094 | 0.156156 | 3.960586 | 0.617627 |
| 1 | 2.775642 | 10.004245 | 4.328600 | 0.144142 | 3.884292 | 0.615608 |
| 2 | 2.775775 | 10.112569 | 4.331216 | 0.033942 | 0.789833 | 0.292524 |
| 3 | 2.774114 | 10.112569 | 4.331920 | 0.031751 | 0.690651 | 0.278834 |
| 4 | 2.771142 | 10.004843 | 4.329112 | 0.101573 | 2.633072 | 0.428303 |
Assignment 5 - Modeling & Evaluation¶
Evaluation helper function¶
In [24]:
from sklearn.metrics import (
accuracy_score, precision_score, recall_score,
f1_score, roc_auc_score, confusion_matrix, classification_report
)
def evaluate_model(name, y_true, y_pred, y_proba):
print(f"\n===== {name} =====")
print("Accuracy :", accuracy_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred, zero_division=0))
print("Recall :", recall_score(y_true, y_pred, zero_division=0))
print("F1 Score :", f1_score(y_true, y_pred, zero_division=0))
print("ROC AUC :", roc_auc_score(y_true, y_proba))
print("\nClassification Report:")
print(classification_report(y_true, y_pred, zero_division=0))
cm = confusion_matrix(y_true, y_pred)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d')
plt.title(f"Confusion Matrix – {name}")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
Logistic Regression - Baseline Model¶
In [25]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(
class_weight='balanced',
max_iter=2000,
random_state=RANDOM_STATE
)
lr.fit(X_train_std, y_train)
y_pred_lr = lr.predict(X_test_std)
y_proba_lr = lr.predict_proba(X_test_std)[:, 1]
evaluate_model("Logistic Regression", y_test, y_pred_lr, y_proba_lr)
===== Logistic Regression =====
Accuracy : 1.0
Precision: 1.0
Recall : 1.0
F1 Score : 1.0
ROC AUC : 1.0
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 1437
1 1.00 1.00 1.00 2
accuracy 1.00 1439
macro avg 1.00 1.00 1.00 1439
weighted avg 1.00 1.00 1.00 1439
Random Foreset Classifier (Tree-Based)¶
In [26]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(
n_estimators=400,
max_depth=None,
min_samples_split=2,
class_weight='balanced',
n_jobs=-1,
random_state=RANDOM_STATE
)
rf.fit(X_train_mm, y_train)
y_pred_rf = rf.predict(X_test_mm)
y_proba_rf = rf.predict_proba(X_test_mm)[:, 1]
evaluate_model("Random Forest", y_test, y_pred_rf, y_proba_rf)
===== Random Forest =====
Accuracy : 1.0
Precision: 1.0
Recall : 1.0
F1 Score : 1.0
ROC AUC : 1.0
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 1437
1 1.00 1.00 1.00 2
accuracy 1.00 1439
macro avg 1.00 1.00 1.00 1439
weighted avg 1.00 1.00 1.00 1439
Gradient Boosting Classifier¶
In [27]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(
n_estimators=250,
learning_rate=0.1,
max_depth=3,
random_state=RANDOM_STATE
)
gb.fit(X_train_mm, y_train)
y_pred_gb = gb.predict(X_test_mm)
y_proba_gb = gb.predict_proba(X_test_mm)[:, 1]
evaluate_model("Gradient Boosting", y_test, y_pred_gb, y_proba_gb)
===== Gradient Boosting =====
Accuracy : 1.0
Precision: 1.0
Recall : 1.0
F1 Score : 1.0
ROC AUC : 1.0
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 1437
1 1.00 1.00 1.00 2
accuracy 1.00 1439
macro avg 1.00 1.00 1.00 1439
weighted avg 1.00 1.00 1.00 1439
SVM with RBF Kernel¶
In [28]:
from sklearn.svm import SVC
svm = SVC(
kernel='rbf',
C=1.0,
gamma='scale',
class_weight='balanced',
probability=True,
random_state=RANDOM_STATE
)
svm.fit(X_train_std, y_train)
y_pred_svm = svm.predict(X_test_std)
y_proba_svm = svm.predict_proba(X_test_std)[:, 1]
evaluate_model("SVM (RBF)", y_test, y_pred_svm, y_proba_svm)
===== SVM (RBF) =====
Accuracy : 1.0
Precision: 1.0
Recall : 1.0
F1 Score : 1.0
ROC AUC : 1.0
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 1437
1 1.00 1.00 1.00 2
accuracy 1.00 1439
macro avg 1.00 1.00 1.00 1439
weighted avg 1.00 1.00 1.00 1439
Comparison Table - Accuracy, Precision, Recall, F1, ROC-AUC¶
In [29]:
def get_scores(y_true, y_pred, y_proba):
return {
"Accuracy": accuracy_score(y_true, y_pred),
"Precision": precision_score(y_true, y_pred, zero_division=0),
"Recall": recall_score(y_true, y_pred, zero_division=0),
"F1": f1_score(y_true, y_pred, zero_division=0),
"ROC_AUC": roc_auc_score(y_true, y_proba)
}
results = []
results.append(("Logistic Regression", get_scores(y_test, y_pred_lr, y_proba_lr)))
results.append(("Random Forest", get_scores(y_test, y_pred_rf, y_proba_rf)))
results.append(("Gradient Boosting", get_scores(y_test, y_pred_gb, y_proba_gb)))
results.append(("SVM (RBF)", get_scores(y_test, y_pred_svm, y_proba_svm)))
rows = []
for name, scores in results:
row = {"Model": name}
row.update(scores)
rows.append(row)
results_df = pd.DataFrame(rows)
display(results_df.sort_values(by="F1", ascending=False))
| Model | Accuracy | Precision | Recall | F1 | ROC_AUC | |
|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 1 | Random Forest | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 2 | Gradient Boosting | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 3 | SVM (RBF) | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
Error Analysis - False Positives & False Negatives¶
In [30]:
best_pred = y_pred_rf # change if another model is best
false_negatives_idx = np.where((y_test == 1) & (best_pred == 0))[0]
false_positives_idx = np.where((y_test == 0) & (best_pred == 1))[0]
print("False Negatives (missed accidents):", len(false_negatives_idx))
print("False Positives (false alarms):", len(false_positives_idx))
False Negatives (missed accidents): 0 False Positives (false alarms): 0
Model Evaluation Function¶
In [31]:
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
roc_auc_score, confusion_matrix, classification_report
)
def evaluate_model(name, y_test, y_pred, y_proba):
print(f"\n===== {name} =====")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, zero_division=0))
print("Recall :", recall_score(y_test, y_pred, zero_division=0))
print("F1 Score :", f1_score(y_test, y_pred, zero_division=0))
print("ROC AUC :", roc_auc_score(y_test, y_proba))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d')
plt.title(f"Confusion Matrix – {name}")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
In [32]:
evaluate_model("Logistic Regression", y_test, y_pred_lr, y_proba_lr)
evaluate_model("Random Forest", y_test, y_pred_rf, y_proba_rf)
evaluate_model("Gradient Boosting", y_test, y_pred_gb, y_proba_gb)
evaluate_model("SVM (RBF)", y_test, y_pred_svm, y_proba_svm)
===== Logistic Regression =====
Accuracy : 1.0
Precision: 1.0
Recall : 1.0
F1 Score : 1.0
ROC AUC : 1.0
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 1437
1 1.00 1.00 1.00 2
accuracy 1.00 1439
macro avg 1.00 1.00 1.00 1439
weighted avg 1.00 1.00 1.00 1439
===== Random Forest =====
Accuracy : 1.0
Precision: 1.0
Recall : 1.0
F1 Score : 1.0
ROC AUC : 1.0
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 1437
1 1.00 1.00 1.00 2
accuracy 1.00 1439
macro avg 1.00 1.00 1.00 1439
weighted avg 1.00 1.00 1.00 1439
===== Gradient Boosting =====
Accuracy : 1.0
Precision: 1.0
Recall : 1.0
F1 Score : 1.0
ROC AUC : 1.0
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 1437
1 1.00 1.00 1.00 2
accuracy 1.00 1439
macro avg 1.00 1.00 1.00 1439
weighted avg 1.00 1.00 1.00 1439
===== SVM (RBF) =====
Accuracy : 1.0
Precision: 1.0
Recall : 1.0
F1 Score : 1.0
ROC AUC : 1.0
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 1437
1 1.00 1.00 1.00 2
accuracy 1.00 1439
macro avg 1.00 1.00 1.00 1439
weighted avg 1.00 1.00 1.00 1439
Ranking Models¶
In [33]:
model_scores = {
"Logistic Regression": f1_score(y_test, y_pred_lr),
"Random Forest": f1_score(y_test, y_pred_rf),
"Gradient Boosting": f1_score(y_test, y_pred_gb),
"SVM": f1_score(y_test, y_pred_svm)
}
pd.DataFrame(model_scores.items(), columns=["Model", "F1 Score"]).sort_values("F1 Score", ascending=False)
Out[33]:
| Model | F1 Score | |
|---|---|---|
| 0 | Logistic Regression | 1.0 |
| 1 | Random Forest | 1.0 |
| 2 | Gradient Boosting | 1.0 |
| 3 | SVM | 1.0 |
Hyperparameter Tuning¶
In [34]:
param_grid_rf = {
"n_estimators": [200, 400],
"max_depth": [None, 10, 20],
"min_samples_split": [2, 5],
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
rf_tuned = GridSearchCV(
estimator=RandomForestClassifier(
class_weight="balanced",
random_state=RANDOM_STATE,
n_jobs=-1
),
param_grid=param_grid_rf,
scoring="f1", # focus on F1 for accident class
cv=cv,
n_jobs=-1,
verbose=1
)
rf_tuned.fit(X_train_mm, y_train)
print("Best parameters:", rf_tuned.best_params_)
print("Best CV F1-score:", rf_tuned.best_score_)
best_rf = rf_tuned.best_estimator_
y_pred_rf_best = best_rf.predict(X_test_mm)
y_proba_rf_best = best_rf.predict_proba(X_test_mm)[:, 1]
eval_model("Random Forest (Tuned)", y_test, y_pred_rf_best, y_proba_rf_best)
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Best CV F1-score: 0.9333333333333332
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[34], line 30 27 y_pred_rf_best = best_rf.predict(X_test_mm) 28 y_proba_rf_best = best_rf.predict_proba(X_test_mm)[:, 1] ---> 30 eval_model("Random Forest (Tuned)", y_test, y_pred_rf_best, y_proba_rf_best) NameError: name 'eval_model' is not defined